diff --git a/docs/source/molmoact2.mdx b/docs/source/molmoact2.mdx index 9a377031c..9eb449ca9 100644 --- a/docs/source/molmoact2.mdx +++ b/docs/source/molmoact2.mdx @@ -386,6 +386,68 @@ These results demonstrate MolmoAct2's strong performance across diverse robotic manipulation tasks. To reproduce them, follow the instructions in the LIBERO evaluation section. +## Hardware Deployment (lerobot-rollout) + +LeRobot-format checkpoints are available on the Hub for direct use with +`lerobot-rollout`. Each checkpoint uses specific camera names that must +match your robot's camera configuration. + +### Camera naming convention + +Each checkpoint expects specific `observation.images.*` keys. +If your robot cameras have different names, use `--rename_map` to map them: + +| Checkpoint | Camera keys | Description | +| ----------------------------- | ---------------------- | ------------------------ | +| MolmoAct2-LIBERO-LeRobot | `image`, `wrist_image` | LIBERO sim cameras | +| MolmoAct2-BimanualYAM-LeRobot | `top`, `left`, `right` | YAM 3-camera setup | +| MolmoAct2-DROID-LeRobot | `cam0`, `cam1` | External + wrist | +| MolmoAct2-SO100_101-LeRobot | `cam0`, `cam1` | Primary + secondary view | + +Example with an SO-100 robot using top and side cameras: + +```bash +lerobot-rollout \ + --policy.path=lerobot/MolmoAct2-SO100_101-LeRobot \ + --rename_map='{"observation.images.top": "observation.images.cam0", "observation.images.side": "observation.images.cam1"}' \ + --robot.type=so100_follower \ + --robot.port=/dev/ttyACM0 \ + --robot.cameras='{ + top: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}, + side: {type: opencv, index_or_path: 2, width: 640, height: 480, fps: 30} + }' \ + --task="pick up the red cube" --duration=30 +``` + +To use a wrist camera instead, just change the rename mapping: + +```bash +--rename_map='{"observation.images.top": "observation.images.cam0", "observation.images.wrist": "observation.images.cam1"}' +``` + +### Joint frame transform (SO-100/101 zero-shot) + + +The MolmoAct2-SO100_101 checkpoint was trained on data that uses a different +joint calibration convention than LeRobot >= 0.5.0. Without a frame +correction, the arm may move in the wrong direction. + +This affects both **zero-shot deployment** and **fine-tuning** from the +original checkpoint. The pretrained weights expect the old convention, so +all joint data (observations and actions) must be transformed to match. + +The converted LeRobot checkpoint (`lerobot/MolmoAct2-SO100_101-LeRobot`) +already includes this correction in its processor pipeline. If you convert +or fine-tune the checkpoint yourself, set the following in the policy config (`configuration_molmoact2.py`): + +- `joint_signs`: `[1, -1, 1, 1, 1, 1]` (flips shoulder_lift direction) +- `joint_offsets`: `[0, 90, 90, 0, 0, 0]` (shifts shoulder_lift and elbow_flex by 90°) + +See the [backward compatibility guide](./backwardcomp) for details on the +calibration change. + + + ## Differences From the Original Implementation This LeRobot port is intended to match MolmoAct2 behavior while using LeRobot's diff --git a/src/lerobot/policies/molmoact2/configuration_molmoact2.py b/src/lerobot/policies/molmoact2/configuration_molmoact2.py index 53aefdee6..bf9437ba9 100644 --- a/src/lerobot/policies/molmoact2/configuration_molmoact2.py +++ b/src/lerobot/policies/molmoact2/configuration_molmoact2.py @@ -79,6 +79,15 @@ class MolmoAct2Config(PreTrainedConfig): eval_seed: int | None = None rtc_config: RTCConfig | None = None + # Joint frame transform for cross-calibration compatibility. + # Some MolmoAct2 checkpoints were trained on data using a different joint + # convention than the current LeRobot calibration. Set both to apply a + # sign/offset correction at runtime (state before model, action after). + # See: https://huggingface.co/docs/lerobot/backwardcomp + # Default is None (no transform). Both must be set together. + joint_signs: list[float] | None = None + joint_offsets: list[float] | None = None + # Default is full finetuning with gradients from the action expert flowing into the VLM. enable_lora_vlm: bool = False lora_rank: int = 64 @@ -123,6 +132,10 @@ class MolmoAct2Config(PreTrainedConfig): def __post_init__(self) -> None: super().__post_init__() + if (self.joint_signs is None) != (self.joint_offsets is None): + raise ValueError("joint_signs and joint_offsets must both be set or both be None.") + if self.joint_signs is not None and len(self.joint_signs) != len(self.joint_offsets): + raise ValueError("joint_signs and joint_offsets must have the same length.") if self.action_mode not in {"continuous", "discrete", "both"}: raise ValueError( f"Unsupported action_mode={self.action_mode!r}. " diff --git a/src/lerobot/policies/molmoact2/processor_molmoact2.py b/src/lerobot/policies/molmoact2/processor_molmoact2.py index 1303e94a1..d2db817ef 100644 --- a/src/lerobot/policies/molmoact2/processor_molmoact2.py +++ b/src/lerobot/policies/molmoact2/processor_molmoact2.py @@ -1005,6 +1005,93 @@ class MolmoAct2PackInputsProcessorStep(ProcessorStep): return features +@ProcessorStepRegistry.register(name="molmoact2_state_frame_transform") +@dataclass +class MolmoAct2StateFrameTransformStep(ProcessorStep): + """Convert robot state from arm frame to model frame before normalization. + + Required for zero-shot deployment of MolmoAct2-SO100_101 on SO-100/101 + arms calibrated with LeRobot >= 0.5.0 (v3.0 convention). The checkpoint + was trained on data using a different joint convention (sign flip on + shoulder_lift, 90 deg offset on shoulder_lift and elbow_flex). + + No-op when joint_signs and joint_offsets are None (default), so this + step has no effect on fine-tuned models or other embodiments. + + state_model = signs * arm_state + offsets + + See: https://huggingface.co/docs/lerobot/backwardcomp + """ + + joint_signs: list[float] | None = None + joint_offsets: list[float] | None = None + + def __call__(self, transition: EnvTransition) -> EnvTransition: + if self.joint_signs is None or self.joint_offsets is None: + return transition + observation = transition.get(TransitionKey.OBSERVATION) + if not isinstance(observation, dict) or OBS_STATE not in observation: + return transition + transition = transition.copy() + observation = observation.copy() + state = torch.as_tensor(observation[OBS_STATE], dtype=torch.float32).clone() + n = len(self.joint_signs) + signs = torch.tensor(self.joint_signs, dtype=torch.float32, device=state.device) + offsets = torch.tensor(self.joint_offsets, dtype=torch.float32, device=state.device) + state[..., :n] = signs * state[..., :n] + offsets + observation[OBS_STATE] = state + transition[TransitionKey.OBSERVATION] = observation + return transition + + def transform_features( + self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]] + ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]: + return features + + def get_config(self) -> dict[str, Any]: + return {"joint_signs": self.joint_signs, "joint_offsets": self.joint_offsets} + + +@ProcessorStepRegistry.register(name="molmoact2_action_frame_transform") +@dataclass +class MolmoAct2ActionFrameTransformStep(ProcessorStep): + """Convert model action from model frame back to arm frame after unnormalization. + + Inverse of MolmoAct2StateFrameTransformStep. Required for zero-shot + MolmoAct2-SO100_101 on SO-100/101 arms. No-op when both fields are None. + + action_arm = signs * (model_action - offsets) + + See: https://huggingface.co/docs/lerobot/backwardcomp + """ + + joint_signs: list[float] | None = None + joint_offsets: list[float] | None = None + + def __call__(self, transition: EnvTransition) -> EnvTransition: + if self.joint_signs is None or self.joint_offsets is None: + return transition + action = transition.get(TransitionKey.ACTION) + if action is None: + return transition + transition = transition.copy() + action = torch.as_tensor(action, dtype=torch.float32).clone() + n = len(self.joint_signs) + signs = torch.tensor(self.joint_signs, dtype=torch.float32, device=action.device) + offsets = torch.tensor(self.joint_offsets, dtype=torch.float32, device=action.device) + action[..., :n] = signs * (action[..., :n] - offsets) + transition[TransitionKey.ACTION] = action + return transition + + def transform_features( + self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]] + ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]: + return features + + def get_config(self) -> dict[str, Any]: + return {"joint_signs": self.joint_signs, "joint_offsets": self.joint_offsets} + + @ProcessorStepRegistry.register(name="molmoact2_clamp_action") @dataclass class MolmoAct2ClampActionProcessorStep(ProcessorStep): @@ -1067,6 +1154,10 @@ def make_molmoact2_pre_post_processors( input_steps: list[ProcessorStep] = [ RenameObservationsProcessorStep(rename_map={}), AddBatchDimensionProcessorStep(), + MolmoAct2StateFrameTransformStep( + joint_signs=config.joint_signs, + joint_offsets=config.joint_offsets, + ), MolmoAct2MaskedNormalizerProcessorStep( features={**config.input_features, **config.output_features}, norm_map=config.normalization_mapping, @@ -1102,6 +1193,10 @@ def make_molmoact2_pre_post_processors( norm_map=config.normalization_mapping, stats=masked_dataset_stats, ), + MolmoAct2ActionFrameTransformStep( + joint_signs=config.joint_signs, + joint_offsets=config.joint_offsets, + ), DeviceProcessorStep(device="cpu"), ] diff --git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py index 62d844932..20a7d715a 100644 --- a/src/lerobot/rollout/context.py +++ b/src/lerobot/rollout/context.py @@ -320,7 +320,9 @@ def build_rollout_context( raise ValueError( f"Visual feature mismatch between policy and robot hardware.\n" f"Policy expects: {expected_visuals}\n" - f"Robot provides: {provided_visuals}" + f"Robot provides: {provided_visuals}\n" + f"Use --rename_map to map camera names, e.g. " + f"""--rename_map='{{"observation.images.top": "observation.images.cam0"}}'""" ) # --- 5. Dataset ------------- diff --git a/tests/policies/molmoact2/test_molmoact2.py b/tests/policies/molmoact2/test_molmoact2.py index 5fba72913..095b73180 100644 --- a/tests/policies/molmoact2/test_molmoact2.py +++ b/tests/policies/molmoact2/test_molmoact2.py @@ -44,10 +44,12 @@ from lerobot.policies.molmoact2.modeling_molmoact2 import ( _combine_rollout_seeds, ) from lerobot.policies.molmoact2.processor_molmoact2 import ( + MolmoAct2ActionFrameTransformStep, MolmoAct2ClampNormalizedProcessorStep, MolmoAct2MaskedNormalizerProcessorStep, MolmoAct2MaskedUnnormalizerProcessorStep, MolmoAct2PackInputsProcessorStep, + MolmoAct2StateFrameTransformStep, _add_gripper_masks_to_stats, _build_discrete_state_string, _normalize_question_text, @@ -926,6 +928,39 @@ def test_question_normalization_matches_release_prompt_style(): ) +def test_joint_frame_transform_round_trip(): + signs = [1.0, -1.0, 1.0, 1.0, 1.0, 1.0] + offsets = [0.0, 90.0, 90.0, 0.0, 0.0, 0.0] + original_state = torch.tensor([[10.0, -90.0, -120.0, 30.0, 0.0, -45.0]]) + + state_step = MolmoAct2StateFrameTransformStep(joint_signs=signs, joint_offsets=offsets) + action_step = MolmoAct2ActionFrameTransformStep(joint_signs=signs, joint_offsets=offsets) + + transition = { + TransitionKey.OBSERVATION: {OBS_STATE: original_state.clone()}, + } + transformed = state_step(transition) + model_state = transformed[TransitionKey.OBSERVATION][OBS_STATE] + + action_transition = {TransitionKey.ACTION: model_state.clone()} + recovered = action_step(action_transition) + recovered_state = recovered[TransitionKey.ACTION] + + assert torch.allclose(recovered_state, original_state) + + +def test_joint_frame_transform_noop_when_none(): + state_step = MolmoAct2StateFrameTransformStep(joint_signs=None, joint_offsets=None) + action_step = MolmoAct2ActionFrameTransformStep(joint_signs=None, joint_offsets=None) + state = torch.tensor([[10.0, -90.0, -120.0]]) + + state_transition = {TransitionKey.OBSERVATION: {OBS_STATE: state}} + assert state_step(state_transition) is state_transition + + action_transition = {TransitionKey.ACTION: state} + assert action_step(action_transition) is action_transition + + def test_action_padding_marks_only_real_dimensions(): step = object.__new__(MolmoAct2PackInputsProcessorStep) step.max_action_dim = 32