mirror of
https://github.com/huggingface/lerobot.git
synced 2026-07-01 07:07:08 +00:00
Enable MolmoAct2 rollout on SO-100/101 with calibration correction (#3879)
* fix(rollout): improve visual feature mismatch error with --rename_map hint * feat(policies): add joint frame transform and hardware deployment docs for MolmoAct2 Add MolmoAct2StateFrameTransformStep and MolmoAct2ActionFrameTransformStep processor steps for cross-calibration compatibility on SO-100/101. Add joint_signs and joint_offsets config fields. Add hardware deployment section to molmoact2.mdx with camera naming convention, joint frame correction, and safety guidance. * chore(docs): address PR comment * fix: address reviewer comments
This commit is contained in:
@@ -386,6 +386,68 @@ These results demonstrate MolmoAct2's strong performance across diverse robotic
|
||||
manipulation tasks. To reproduce them, follow the instructions in the LIBERO
|
||||
evaluation section.
|
||||
|
||||
## Hardware Deployment (lerobot-rollout)
|
||||
|
||||
LeRobot-format checkpoints are available on the Hub for direct use with
|
||||
`lerobot-rollout`. Each checkpoint uses specific camera names that must
|
||||
match your robot's camera configuration.
|
||||
|
||||
### Camera naming convention
|
||||
|
||||
Each checkpoint expects specific `observation.images.*` keys.
|
||||
If your robot cameras have different names, use `--rename_map` to map them:
|
||||
|
||||
| Checkpoint | Camera keys | Description |
|
||||
| ----------------------------- | ---------------------- | ------------------------ |
|
||||
| MolmoAct2-LIBERO-LeRobot | `image`, `wrist_image` | LIBERO sim cameras |
|
||||
| MolmoAct2-BimanualYAM-LeRobot | `top`, `left`, `right` | YAM 3-camera setup |
|
||||
| MolmoAct2-DROID-LeRobot | `cam0`, `cam1` | External + wrist |
|
||||
| MolmoAct2-SO100_101-LeRobot | `cam0`, `cam1` | Primary + secondary view |
|
||||
|
||||
Example with an SO-100 robot using top and side cameras:
|
||||
|
||||
```bash
|
||||
lerobot-rollout \
|
||||
--policy.path=lerobot/MolmoAct2-SO100_101-LeRobot \
|
||||
--rename_map='{"observation.images.top": "observation.images.cam0", "observation.images.side": "observation.images.cam1"}' \
|
||||
--robot.type=so100_follower \
|
||||
--robot.port=/dev/ttyACM0 \
|
||||
--robot.cameras='{
|
||||
top: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30},
|
||||
side: {type: opencv, index_or_path: 2, width: 640, height: 480, fps: 30}
|
||||
}' \
|
||||
--task="pick up the red cube" --duration=30
|
||||
```
|
||||
|
||||
To use a wrist camera instead, just change the rename mapping:
|
||||
|
||||
```bash
|
||||
--rename_map='{"observation.images.top": "observation.images.cam0", "observation.images.wrist": "observation.images.cam1"}'
|
||||
```
|
||||
|
||||
### Joint frame transform (SO-100/101 zero-shot)
|
||||
|
||||
<Tip warning={true}>
|
||||
The MolmoAct2-SO100_101 checkpoint was trained on data that uses a different
|
||||
joint calibration convention than LeRobot >= 0.5.0. Without a frame
|
||||
correction, the arm may move in the wrong direction.
|
||||
|
||||
This affects both **zero-shot deployment** and **fine-tuning** from the
|
||||
original checkpoint. The pretrained weights expect the old convention, so
|
||||
all joint data (observations and actions) must be transformed to match.
|
||||
|
||||
The converted LeRobot checkpoint (`lerobot/MolmoAct2-SO100_101-LeRobot`)
|
||||
already includes this correction in its processor pipeline. If you convert
|
||||
or fine-tune the checkpoint yourself, set the following in the policy config (`configuration_molmoact2.py`):
|
||||
|
||||
- `joint_signs`: `[1, -1, 1, 1, 1, 1]` (flips shoulder_lift direction)
|
||||
- `joint_offsets`: `[0, 90, 90, 0, 0, 0]` (shifts shoulder_lift and elbow_flex by 90°)
|
||||
|
||||
See the [backward compatibility guide](./backwardcomp) for details on the
|
||||
calibration change.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Differences From the Original Implementation
|
||||
|
||||
This LeRobot port is intended to match MolmoAct2 behavior while using LeRobot's
|
||||
|
||||
@@ -79,6 +79,15 @@ class MolmoAct2Config(PreTrainedConfig):
|
||||
eval_seed: int | None = None
|
||||
rtc_config: RTCConfig | None = None
|
||||
|
||||
# Joint frame transform for cross-calibration compatibility.
|
||||
# Some MolmoAct2 checkpoints were trained on data using a different joint
|
||||
# convention than the current LeRobot calibration. Set both to apply a
|
||||
# sign/offset correction at runtime (state before model, action after).
|
||||
# See: https://huggingface.co/docs/lerobot/backwardcomp
|
||||
# Default is None (no transform). Both must be set together.
|
||||
joint_signs: list[float] | None = None
|
||||
joint_offsets: list[float] | None = None
|
||||
|
||||
# Default is full finetuning with gradients from the action expert flowing into the VLM.
|
||||
enable_lora_vlm: bool = False
|
||||
lora_rank: int = 64
|
||||
@@ -123,6 +132,10 @@ class MolmoAct2Config(PreTrainedConfig):
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
super().__post_init__()
|
||||
if (self.joint_signs is None) != (self.joint_offsets is None):
|
||||
raise ValueError("joint_signs and joint_offsets must both be set or both be None.")
|
||||
if self.joint_signs is not None and len(self.joint_signs) != len(self.joint_offsets):
|
||||
raise ValueError("joint_signs and joint_offsets must have the same length.")
|
||||
if self.action_mode not in {"continuous", "discrete", "both"}:
|
||||
raise ValueError(
|
||||
f"Unsupported action_mode={self.action_mode!r}. "
|
||||
|
||||
@@ -1005,6 +1005,93 @@ class MolmoAct2PackInputsProcessorStep(ProcessorStep):
|
||||
return features
|
||||
|
||||
|
||||
@ProcessorStepRegistry.register(name="molmoact2_state_frame_transform")
|
||||
@dataclass
|
||||
class MolmoAct2StateFrameTransformStep(ProcessorStep):
|
||||
"""Convert robot state from arm frame to model frame before normalization.
|
||||
|
||||
Required for zero-shot deployment of MolmoAct2-SO100_101 on SO-100/101
|
||||
arms calibrated with LeRobot >= 0.5.0 (v3.0 convention). The checkpoint
|
||||
was trained on data using a different joint convention (sign flip on
|
||||
shoulder_lift, 90 deg offset on shoulder_lift and elbow_flex).
|
||||
|
||||
No-op when joint_signs and joint_offsets are None (default), so this
|
||||
step has no effect on fine-tuned models or other embodiments.
|
||||
|
||||
state_model = signs * arm_state + offsets
|
||||
|
||||
See: https://huggingface.co/docs/lerobot/backwardcomp
|
||||
"""
|
||||
|
||||
joint_signs: list[float] | None = None
|
||||
joint_offsets: list[float] | None = None
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
if self.joint_signs is None or self.joint_offsets is None:
|
||||
return transition
|
||||
observation = transition.get(TransitionKey.OBSERVATION)
|
||||
if not isinstance(observation, dict) or OBS_STATE not in observation:
|
||||
return transition
|
||||
transition = transition.copy()
|
||||
observation = observation.copy()
|
||||
state = torch.as_tensor(observation[OBS_STATE], dtype=torch.float32).clone()
|
||||
n = len(self.joint_signs)
|
||||
signs = torch.tensor(self.joint_signs, dtype=torch.float32, device=state.device)
|
||||
offsets = torch.tensor(self.joint_offsets, dtype=torch.float32, device=state.device)
|
||||
state[..., :n] = signs * state[..., :n] + offsets
|
||||
observation[OBS_STATE] = state
|
||||
transition[TransitionKey.OBSERVATION] = observation
|
||||
return transition
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
return features
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {"joint_signs": self.joint_signs, "joint_offsets": self.joint_offsets}
|
||||
|
||||
|
||||
@ProcessorStepRegistry.register(name="molmoact2_action_frame_transform")
|
||||
@dataclass
|
||||
class MolmoAct2ActionFrameTransformStep(ProcessorStep):
|
||||
"""Convert model action from model frame back to arm frame after unnormalization.
|
||||
|
||||
Inverse of MolmoAct2StateFrameTransformStep. Required for zero-shot
|
||||
MolmoAct2-SO100_101 on SO-100/101 arms. No-op when both fields are None.
|
||||
|
||||
action_arm = signs * (model_action - offsets)
|
||||
|
||||
See: https://huggingface.co/docs/lerobot/backwardcomp
|
||||
"""
|
||||
|
||||
joint_signs: list[float] | None = None
|
||||
joint_offsets: list[float] | None = None
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
if self.joint_signs is None or self.joint_offsets is None:
|
||||
return transition
|
||||
action = transition.get(TransitionKey.ACTION)
|
||||
if action is None:
|
||||
return transition
|
||||
transition = transition.copy()
|
||||
action = torch.as_tensor(action, dtype=torch.float32).clone()
|
||||
n = len(self.joint_signs)
|
||||
signs = torch.tensor(self.joint_signs, dtype=torch.float32, device=action.device)
|
||||
offsets = torch.tensor(self.joint_offsets, dtype=torch.float32, device=action.device)
|
||||
action[..., :n] = signs * (action[..., :n] - offsets)
|
||||
transition[TransitionKey.ACTION] = action
|
||||
return transition
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
return features
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {"joint_signs": self.joint_signs, "joint_offsets": self.joint_offsets}
|
||||
|
||||
|
||||
@ProcessorStepRegistry.register(name="molmoact2_clamp_action")
|
||||
@dataclass
|
||||
class MolmoAct2ClampActionProcessorStep(ProcessorStep):
|
||||
@@ -1067,6 +1154,10 @@ def make_molmoact2_pre_post_processors(
|
||||
input_steps: list[ProcessorStep] = [
|
||||
RenameObservationsProcessorStep(rename_map={}),
|
||||
AddBatchDimensionProcessorStep(),
|
||||
MolmoAct2StateFrameTransformStep(
|
||||
joint_signs=config.joint_signs,
|
||||
joint_offsets=config.joint_offsets,
|
||||
),
|
||||
MolmoAct2MaskedNormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
@@ -1102,6 +1193,10 @@ def make_molmoact2_pre_post_processors(
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=masked_dataset_stats,
|
||||
),
|
||||
MolmoAct2ActionFrameTransformStep(
|
||||
joint_signs=config.joint_signs,
|
||||
joint_offsets=config.joint_offsets,
|
||||
),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
|
||||
@@ -320,7 +320,9 @@ def build_rollout_context(
|
||||
raise ValueError(
|
||||
f"Visual feature mismatch between policy and robot hardware.\n"
|
||||
f"Policy expects: {expected_visuals}\n"
|
||||
f"Robot provides: {provided_visuals}"
|
||||
f"Robot provides: {provided_visuals}\n"
|
||||
f"Use --rename_map to map camera names, e.g. "
|
||||
f"""--rename_map='{{"observation.images.top": "observation.images.cam0"}}'"""
|
||||
)
|
||||
|
||||
# --- 5. Dataset -------------
|
||||
|
||||
@@ -44,10 +44,12 @@ from lerobot.policies.molmoact2.modeling_molmoact2 import (
|
||||
_combine_rollout_seeds,
|
||||
)
|
||||
from lerobot.policies.molmoact2.processor_molmoact2 import (
|
||||
MolmoAct2ActionFrameTransformStep,
|
||||
MolmoAct2ClampNormalizedProcessorStep,
|
||||
MolmoAct2MaskedNormalizerProcessorStep,
|
||||
MolmoAct2MaskedUnnormalizerProcessorStep,
|
||||
MolmoAct2PackInputsProcessorStep,
|
||||
MolmoAct2StateFrameTransformStep,
|
||||
_add_gripper_masks_to_stats,
|
||||
_build_discrete_state_string,
|
||||
_normalize_question_text,
|
||||
@@ -926,6 +928,39 @@ def test_question_normalization_matches_release_prompt_style():
|
||||
)
|
||||
|
||||
|
||||
def test_joint_frame_transform_round_trip():
|
||||
signs = [1.0, -1.0, 1.0, 1.0, 1.0, 1.0]
|
||||
offsets = [0.0, 90.0, 90.0, 0.0, 0.0, 0.0]
|
||||
original_state = torch.tensor([[10.0, -90.0, -120.0, 30.0, 0.0, -45.0]])
|
||||
|
||||
state_step = MolmoAct2StateFrameTransformStep(joint_signs=signs, joint_offsets=offsets)
|
||||
action_step = MolmoAct2ActionFrameTransformStep(joint_signs=signs, joint_offsets=offsets)
|
||||
|
||||
transition = {
|
||||
TransitionKey.OBSERVATION: {OBS_STATE: original_state.clone()},
|
||||
}
|
||||
transformed = state_step(transition)
|
||||
model_state = transformed[TransitionKey.OBSERVATION][OBS_STATE]
|
||||
|
||||
action_transition = {TransitionKey.ACTION: model_state.clone()}
|
||||
recovered = action_step(action_transition)
|
||||
recovered_state = recovered[TransitionKey.ACTION]
|
||||
|
||||
assert torch.allclose(recovered_state, original_state)
|
||||
|
||||
|
||||
def test_joint_frame_transform_noop_when_none():
|
||||
state_step = MolmoAct2StateFrameTransformStep(joint_signs=None, joint_offsets=None)
|
||||
action_step = MolmoAct2ActionFrameTransformStep(joint_signs=None, joint_offsets=None)
|
||||
state = torch.tensor([[10.0, -90.0, -120.0]])
|
||||
|
||||
state_transition = {TransitionKey.OBSERVATION: {OBS_STATE: state}}
|
||||
assert state_step(state_transition) is state_transition
|
||||
|
||||
action_transition = {TransitionKey.ACTION: state}
|
||||
assert action_step(action_transition) is action_transition
|
||||
|
||||
|
||||
def test_action_padding_marks_only_real_dimensions():
|
||||
step = object.__new__(MolmoAct2PackInputsProcessorStep)
|
||||
step.max_action_dim = 32
|
||||
|
||||
Reference in New Issue
Block a user