feat(hub): add pretrained_revision to pin Hub model versions

- Add pretrained_revision field to PreTrainedConfig (policies) and RewardModelConfig (reward models), and thread it through make_policy(), make_pre_post_processors(), and make_reward_model() so that weights and processor configs can be loaded from a specific Hub commit, branch, or tag. Defaults to None (latest version, preserving current behavior). Dataset and env hub loading already supported revision pinning.
2026-06-16 15:57:03 +00:00 · 2026-06-16 15:34:56 +02:00
5 changed files with 10 additions and 0 deletions
@@ -79,6 +79,8 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):  # type: igno
    # Either the repo ID of a model hosted on the Hub or a path to a directory containing weights
    # saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch.
    pretrained_path: Path | None = None
+    # Optional Hub revision (commit hash, branch, or tag) to pin the pretrained model version.
+    pretrained_revision: str | None = None

    def __post_init__(self) -> None:
        if not self.device or not is_torch_device_available(self.device):
@@ -56,6 +56,8 @@ class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
    device: str | None = None

    pretrained_path: str | None = None
+    # Optional Hub revision (commit hash, branch, or tag) to pin the pretrained reward model version.
+    pretrained_revision: str | None = None

    push_to_hub: bool = False
    repo_id: str | None = None
@@ -252,6 +252,7 @@ class ProcessorConfigKwargs(TypedDict, total=False):
 def make_pre_post_processors(
    policy_cfg: PreTrainedConfig,
    pretrained_path: str | None = None,
+    pretrained_revision: str | None = None,
    **kwargs: Unpack[ProcessorConfigKwargs],
 ) -> tuple[
    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
@@ -309,6 +310,7 @@ def make_pre_post_processors(
            overrides=kwargs.get("preprocessor_overrides", {}),
            to_transition=batch_to_transition,
            to_output=transition_to_batch,
+            revision=pretrained_revision,
        )
        postprocessor = PolicyProcessorPipeline.from_pretrained(
            pretrained_model_name_or_path=pretrained_path,
@@ -318,6 +320,7 @@ def make_pre_post_processors(
            overrides=kwargs.get("postprocessor_overrides", {}),
            to_transition=policy_action_to_transition,
            to_output=transition_to_policy_action,
+            revision=pretrained_revision,
        )
        _reconnect_relative_absolute_steps(preprocessor, postprocessor)
        return preprocessor, postprocessor
@@ -557,6 +560,7 @@ def make_policy(
        # Load a pretrained policy and override the config if needed (for example, if there are inference-time
        # hyperparameters that we want to vary).
        kwargs["pretrained_name_or_path"] = cfg.pretrained_path
+        kwargs["revision"] = cfg.pretrained_revision
        policy = policy_cls.from_pretrained(**kwargs)
    elif cfg.pretrained_path and cfg.use_peft:
        # Load a pretrained PEFT model on top of the policy. The pretrained path points to the folder/repo
@@ -124,6 +124,7 @@ def make_reward_model(cfg: RewardModelConfig, **kwargs) -> PreTrainedRewardModel

    if cfg.pretrained_path:
        kwargs["pretrained_name_or_path"] = cfg.pretrained_path
+        kwargs["revision"] = cfg.pretrained_revision
        reward_model = reward_cls.from_pretrained(**kwargs)
    else:
        reward_model = reward_cls(**kwargs)
@@ -345,6 +345,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        preprocessor, postprocessor = make_pre_post_processors(
            policy_cfg=cfg.policy,
            pretrained_path=processor_pretrained_path,
+            pretrained_revision=getattr(cfg.policy, "pretrained_revision", None),
            **processor_kwargs,
        )