Fix pi0fast model id in docs (#3855 )

fix(deps): pin cmeel-urdfdom <5 and cmeel-tinyxml2 <11 in placo-dep (#3873 )
placo pulls in pin (Pinocchio), whose binary wheels dlopen specific cmeel sonames (liburdfdom_sensor.so.4.0, libtinyxml2.so.10) but declare only `>=` floors on their cmeel packages. The 2026-05-21 major bumps (cmeel-urdfdom 6.0.0 -> .so.6, cmeel-tinyxml2 11.0.0 -> .so.11) ship newer sonames, so left unpinned the resolver grabs them and `import placo` fails at load with "liburdfdom_sensor.so.4.0: cannot open shared object file". #3647 capped placo and hardened the kinematics import, but the guard only defers the failure: constructing RobotKinematics still raises. Pin the cmeel packages to the 4.x / 10.x ABI the placo/pin wheels are built against (there is no cmeel-urdfdom 5.x; <5 selects 4.x). Regenerated uv.lock with uv 0.8.0 to match CI; the only resolution change is the two cmeel versions (plus a deterministic decord platform-marker cascade from 4.0.1's wider wheel set). Fixes #3755
2026-06-24 11:47:17 +00:00 · 2026-06-24 11:44:03 +02:00 · 2026-06-24 11:23:25 +02:00
18 changed files with 738 additions and 626 deletions
@@ -17,7 +17,7 @@ the paper, see [allenai/molmoact2](https://github.com/allenai/molmoact2).
 Install LeRobot with the MolmoAct2 optional dependencies:

 ```bash
-uv sync --locked --extra molmoact2
+pip install -e ".[molmoact2]"
 ```

 To run the models in this repository, you need an NVIDIA GPU. The measurements
@@ -46,8 +46,8 @@ The repo has been tested with Ubuntu 22.04.

 To use MolmoAct2 in a LeRobot training config, set:

-```bash
--policy.type=molmoact2
+```python
+policy.type=molmoact2
 ```

 ## Training
@@ -96,7 +96,7 @@ lerobot-train \
    --policy.type=pi0_fast \
    --output_dir=./outputs/pi0fast_training \
    --job_name=pi0fast_training \
-    --policy.pretrained_path=lerobot/pi0_fast_base \
+    --policy.pretrained_path=lerobot/pi0fast-base \
    --policy.dtype=bfloat16 \
    --policy.gradient_checkpointing=true \
    --policy.chunk_size=10 \
@@ -187,7 +187,7 @@ lerobot-train \
  --dataset.repo_id=lerobot/libero \
  --output_dir=outputs/libero_pi0fast \
  --job_name=libero_pi0fast \
-  --policy.path=lerobot/pi0fast_base \
+  --policy.path=lerobot/pi0fast-base \
  --policy.dtype=bfloat16 \
  --steps=100000 \
  --save_freq=20000 \
@@ -140,7 +140,14 @@ av-dep = ["av>=15.0.0,<16.0.0"]
 pygame-dep = ["pygame>=2.5.1,<2.7.0"]
 # NOTE: 0.9.16 links against liburdfdom_sensor.so.4, which is unavailable on Ubuntu 24.04
 # (noble ships urdfdom 3.x). Cap below 0.9.16 until system urdfdom 4.x is broadly available.
-placo-dep = ["placo>=0.9.6,<0.9.16"]
+#
+# NOTE: placo pulls in pin (Pinocchio), whose binary wheels dlopen specific cmeel sonames
+# (liburdfdom_sensor.so.4.0, libtinyxml2.so.10) but declare only `>=` floors on their cmeel
+# packages. The 2026-05-21 major bumps (cmeel-urdfdom 6.0.0 -> .so.6, cmeel-tinyxml2 11.0.0
+# -> .so.11) ship newer sonames, so left unpinned the resolver grabs them and `import placo`
+# fails at load with "liburdfdom_sensor.so.4.0: cannot open shared object file" (see #3755).
+# There is no cmeel-urdfdom 5.x; <5 selects the 4.x ABI the placo/pin wheels are built against.
+placo-dep = ["placo>=0.9.6,<0.9.16", "cmeel-urdfdom>=4,<5", "cmeel-tinyxml2<11"]
 transformers-dep = ["transformers>=5.4.0,<5.6.0"]
 grpcio-dep = ["grpcio>=1.73.1,<2.0.0", "protobuf>=6.31.1,<8.0.0"]
 accelerate-dep = ["accelerate>=1.14.0,<2.0.0"]
@@ -1 +1 @@
-../../../../docs/source/molmoact2.mdx
+../../../../docs/source/policy_molmoact2_README.md
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,9 +16,16 @@

 from __future__ import annotations

+import json
+import math
+import os
+from contextlib import suppress
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Any

+from huggingface_hub import snapshot_download
+
 from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature, PreTrainedConfig
 from lerobot.optim import (
    AdamWConfig,
@@ -28,6 +37,146 @@ from lerobot.utils.constants import ACTION, OBS_STATE

 from ..rtc.configuration_rtc import RTCConfig

+MOLMOACT2_DEFAULT_NUM_IMAGES = 2
+MOLMOACT2_IMAGE_TOKENS_PER_IMAGE = 196
+MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET = 80
+MOLMOACT2_TASK_TOKEN_BUDGET = 32
+MOLMOACT2_SEQUENCE_LENGTH_MARGIN = 32
+MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE = 64
+MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS = 4
+MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP = 6
+MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM = 0.95
+
+
+def _hf_token() -> str | None:
+    return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_checkpoint_location(
+    checkpoint_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    checkpoint_path = str(checkpoint_path or "").strip()
+    if not checkpoint_path:
+        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
+    local_path = Path(checkpoint_path).expanduser()
+    if local_path.exists():
+        return str(local_path)
+    return snapshot_download(
+        repo_id=checkpoint_path,
+        repo_type="model",
+        revision=revision,
+        force_download=force_download,
+        ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+        token=_hf_token(),
+    )
+
+
+def _load_hf_norm_metadata_for_tag(
+    checkpoint_path: str,
+    *,
+    revision: str | None,
+    force_download: bool,
+    norm_tag: str | None,
+) -> dict[str, Any]:
+    norm_tag = str(norm_tag or "").strip()
+    if not norm_tag:
+        return {}
+    checkpoint_location = Path(
+        _resolve_checkpoint_location(
+            checkpoint_path,
+            revision=revision,
+            force_download=force_download,
+        )
+    )
+    norm_stats_filename = "norm_stats.json"
+    config_path = checkpoint_location / "config.json"
+    if config_path.exists():
+        with suppress(OSError, json.JSONDecodeError):
+            norm_stats_filename = str(
+                json.loads(config_path.read_text()).get("norm_stats_filename") or norm_stats_filename
+            )
+    stats_path = checkpoint_location / norm_stats_filename
+    if not stats_path.exists():
+        raise FileNotFoundError(
+            f"MolmoAct2 HF checkpoint is missing {norm_stats_filename!r}; cannot resolve norm_tag={norm_tag!r}."
+        )
+    payload = json.loads(stats_path.read_text())
+    metadata_by_tag = payload.get("metadata_by_tag")
+    if not isinstance(metadata_by_tag, dict):
+        raise ValueError(f"MolmoAct2 norm stats file {stats_path} has no metadata_by_tag mapping.")
+    metadata = metadata_by_tag.get(norm_tag)
+    if not isinstance(metadata, dict):
+        available = sorted(str(tag) for tag in metadata_by_tag)
+        raise ValueError(f"Unknown MolmoAct2 norm_tag={norm_tag!r}. Available tags: {available}.")
+    return metadata
+
+
+@LRSchedulerConfig.register_subclass("molmoact2_cosine_decay_with_warmup")
+@dataclass
+class MolmoAct2CosineDecayWithWarmupSchedulerConfig(CosineDecayWithWarmupSchedulerConfig):
+    """MolmoAct2-local cosine scheduler with optional decay-step auto-match.
+
+    LeRobot's generic cosine scheduler keeps an explicit integer decay length.
+    For MolmoAct2, leaving num_decay_steps unset means "decay across this run's
+    training steps"; build() is the first point where num_training_steps is known.
+    """
+
+    num_decay_steps: int | None
+
+    def build(self, optimizer, num_training_steps: int):
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.peak_lr,
+            decay_lr=self.decay_lr,
+            num_warmup_steps=self.num_warmup_steps,
+            num_decay_steps=num_training_steps if self.num_decay_steps is None else self.num_decay_steps,
+        ).build(optimizer, num_training_steps=num_training_steps)
+
+
+def _round_up(value: int, multiple: int) -> int:
+    return int(math.ceil(value / multiple) * multiple)
+
+
+def infer_molmoact2_max_sequence_length(
+    *,
+    num_images: int,
+    state_dim: int,
+    action_dim: int,
+    action_horizon: int,
+    include_discrete_action: bool,
+) -> int:
+    """Infer the padded text/image sequence cap from MolmoAct2's fixed token layout."""
+    if num_images < 1:
+        num_images = MOLMOACT2_DEFAULT_NUM_IMAGES
+    if state_dim < 0:
+        state_dim = 0
+    if action_dim < 1:
+        action_dim = 1
+    if action_horizon < 1:
+        action_horizon = 1
+
+    image_tokens = num_images * MOLMOACT2_IMAGE_TOKENS_PER_IMAGE
+    prompt_tokens = (
+        MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET
+        + MOLMOACT2_TASK_TOKEN_BUDGET
+        + state_dim
+        + MOLMOACT2_SEQUENCE_LENGTH_MARGIN
+    )
+    action_tokens = 0
+    if include_discrete_action:
+        action_tokens_per_step = max(
+            MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP,
+            math.ceil(action_dim * MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM),
+        )
+        action_tokens = MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS + action_horizon * action_tokens_per_step
+
+    return _round_up(
+        image_tokens + prompt_tokens + action_tokens,
+        MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE,
+    )
+

@PreTrainedConfig.register_subclass("molmoact2")
@dataclass
@@ -106,7 +255,7 @@ class MolmoAct2Config(PreTrainedConfig):
    optimizer_grad_clip_norm: float = 1.0

    scheduler_warmup_steps: int = 200
-    scheduler_decay_steps: int = 100_000
+    scheduler_decay_steps: int | None = None
    scheduler_decay_lr: float = 1e-6

    normalization_mapping: dict[str, NormalizationMode] = field(
@@ -184,6 +333,41 @@ class MolmoAct2Config(PreTrainedConfig):
        if self.max_sequence_length is not None and self.max_sequence_length < 1:
            raise ValueError(f"max_sequence_length must be >= 1 or None, got {self.max_sequence_length}.")

+    def inferred_max_sequence_length(
+        self,
+        *,
+        num_images: int | None = None,
+        state_dim: int | None = None,
+        action_dim: int | None = None,
+        action_horizon: int | None = None,
+        include_discrete_action: bool | None = None,
+    ) -> int:
+        if self.max_sequence_length is not None:
+            return int(self.max_sequence_length)
+
+        if num_images is None:
+            num_images = len(self.image_keys) or len(self.image_features) or MOLMOACT2_DEFAULT_NUM_IMAGES
+        if state_dim is None:
+            state_feature = self.robot_state_feature
+            state_dim = int(state_feature.shape[0]) if state_feature is not None else 0
+        if action_dim is None:
+            action_feature = self.action_feature
+            action_dim = (
+                int(action_feature.shape[0]) if action_feature is not None else self.expected_max_action_dim
+            )
+        if action_horizon is None:
+            action_horizon = self.chunk_size
+        if include_discrete_action is None:
+            include_discrete_action = self.action_mode in {"discrete", "both"}
+
+        return infer_molmoact2_max_sequence_length(
+            num_images=int(num_images),
+            state_dim=int(state_dim),
+            action_dim=int(action_dim),
+            action_horizon=int(action_horizon),
+            include_discrete_action=bool(include_discrete_action),
+        )
+
    @property
    def observation_delta_indices(self) -> None:
        return None
@@ -206,7 +390,7 @@ class MolmoAct2Config(PreTrainedConfig):
        )

    def get_scheduler_preset(self) -> LRSchedulerConfig | None:
-        return CosineDecayWithWarmupSchedulerConfig(
+        return MolmoAct2CosineDecayWithWarmupSchedulerConfig(
            peak_lr=self.optimizer_lr,
            decay_lr=self.scheduler_decay_lr,
            num_warmup_steps=self.scheduler_warmup_steps,
@@ -242,3 +426,94 @@ class MolmoAct2Config(PreTrainedConfig):
                shape=(self.expected_max_action_dim,),
            )
            self.output_features[ACTION] = action_feature
+
+    def apply_norm_tag_metadata(self) -> None:
+        if not str(self.norm_tag or "").strip():
+            return
+        metadata = _load_hf_norm_metadata_for_tag(
+            self.checkpoint_path,
+            revision=self.checkpoint_revision,
+            force_download=bool(self.checkpoint_force_download),
+            norm_tag=self.norm_tag,
+        )
+        if metadata.get("action_horizon") is not None:
+            self.chunk_size = int(metadata["action_horizon"])
+        if metadata.get("n_action_steps") is not None:
+            self.n_action_steps = int(metadata["n_action_steps"])
+        if not self.setup_type and metadata.get("setup_type") is not None:
+            self.setup_type = str(metadata["setup_type"])
+        if not self.control_mode and metadata.get("control_mode") is not None:
+            self.control_mode = str(metadata["control_mode"])
+
+    def saved_policy_action_mode(self) -> str | None:
+        pretrained_path = getattr(self, "pretrained_path", None)
+        if pretrained_path is None:
+            return None
+        config_path = Path(pretrained_path) / "config.json"
+        if not config_path.exists():
+            return None
+        try:
+            mode = json.loads(config_path.read_text()).get("action_mode")
+        except (OSError, json.JSONDecodeError):
+            return None
+        if mode in {"continuous", "discrete", "both"}:
+            return str(mode)
+        return None
+
+    def training_action_mode(self, saved_policy_action_mode: str | None = None) -> str:
+        return saved_policy_action_mode or self.action_mode
+
+    def validate_inference_action_mode(self, saved_policy_action_mode: str | None = None) -> None:
+        requested_mode = self.inference_action_mode
+        if requested_mode is None:
+            return
+        training_mode = self.training_action_mode(saved_policy_action_mode)
+        if requested_mode == "continuous" and training_mode == "discrete":
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='discrete' and cannot run "
+                "continuous inference."
+            )
+        if requested_mode == "discrete" and training_mode == "continuous":
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='continuous' and cannot run "
+                "discrete inference. Train with action_mode='both' or action_mode='discrete' first."
+            )
+
+    def validate_checkpoint_action_mode(
+        self,
+        checkpoint_action_mode: str,
+        *,
+        has_action_expert: bool,
+    ) -> None:
+        if self.action_mode == "both" and checkpoint_action_mode != "both":
+            raise ValueError(
+                f"action_mode='both' requires checkpoint action_mode='both', got {checkpoint_action_mode!r}."
+            )
+        if self.action_mode == "discrete" and checkpoint_action_mode not in {"discrete", "both"}:
+            raise ValueError(
+                f"action_mode='discrete' requires checkpoint action_mode in {{'discrete', 'both'}}, "
+                f"got {checkpoint_action_mode!r}."
+            )
+        if self.action_mode in {"continuous", "both"} and not has_action_expert:
+            raise ValueError("Continuous MolmoAct2 training requires an action expert checkpoint.")
+
+    def resolve_inference_action_mode(
+        self,
+        requested_mode: str | None,
+        saved_policy_action_mode: str | None = None,
+    ) -> str:
+        training_mode = self.training_action_mode(saved_policy_action_mode)
+        if requested_mode is None:
+            requested_mode = self.inference_action_mode
+        if requested_mode is None:
+            raise ValueError(
+                "MolmoAct2 inference requires `inference_action_mode` to be set explicitly "
+                "to either 'continuous' or 'discrete'."
+            )
+        if requested_mode not in {"continuous", "discrete"}:
+            raise ValueError("MolmoAct2 inference_action_mode must be either 'continuous' or 'discrete'.")
+        if requested_mode == "continuous" and training_mode == "discrete":
+            raise ValueError("MolmoAct2 action_mode='discrete' checkpoint cannot run continuous inference.")
+        if requested_mode == "discrete" and training_mode == "continuous":
+            raise ValueError("MolmoAct2 action_mode='continuous' checkpoint cannot run discrete inference.")
+        return requested_mode
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,3 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# ruff: noqa
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,19 +14,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa
+
 import logging
+import os
 from pathlib import Path
 from typing import ClassVar

 import numpy as np
 from tokenizers import ByteLevelBPETokenizer
 from tokenizers.trainers import BpeTrainer
+from huggingface_hub import snapshot_download
 from transformers import PreTrainedTokenizerFast
 from transformers.processing_utils import ProcessorMixin

-from ..modeling_molmoact2 import _hf_token

-logger = logging.getLogger(__name__)
+def _hf_token() -> str | None:
+    return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")


 def _resolve_tokenizer_location(
@@ -36,8 +42,6 @@ def _resolve_tokenizer_location(
    local_path = Path(str(tokenizer_path)).expanduser()
    if local_path.exists():
        return str(local_path)
-    from huggingface_hub import snapshot_download
-
    return snapshot_download(
        repo_id=str(tokenizer_path),
        repo_type="model",
@@ -130,8 +134,9 @@ class UniversalActionProcessor(ProcessorMixin):
                ), (
                    f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
                )
-            except Exception:
-                logger.warning("Error decoding tokens: %s", token, exc_info=True)
+            except Exception as e:
+                print(f"Error decoding tokens: {e}")
+                print(f"Tokens: {token}")
                decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
            decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
        return np.stack(decoded_actions)
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,12 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """
 MolmoAct2 configuration
 """

-from typing import Any
+from typing import Optional, Any

 from transformers import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,28 +14,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """Image processor class for MolmoAct2"""

-import einops
+from typing import Optional, Union
 import numpy as np
+import einops
 import torch
 import torchvision.transforms
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
-from transformers.image_transforms import convert_to_rgb
+
 from transformers.image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ImageInput,
    PILImageResampling,
    make_flat_list_of_images,
-    to_numpy_array,
    valid_images,
+    to_numpy_array,
 )
+from transformers.image_transforms import convert_to_rgb
 from transformers.processing_utils import ImagesKwargs
+from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
 from transformers.utils import TensorType, logging

+
 logger = logging.get_logger(__name__)


@@ -66,8 +73,8 @@ def resize_image(
        )(image)
        resized = torch.clip(resized, 0.0, 1.0).to(dtype)
    else:
-        assert image.dtype == torch.uint8, (
-            f"SigLIP expects float images or uint8 images, but got {image.dtype}"
+        assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            image.dtype
        )
        in_min = 0.0
        in_max = 255.0
@@ -89,6 +96,7 @@ def resize_image(
 def select_tiling(h, w, patch_size, max_num_crops):
    """Divide in image of size [w, h] in up to max_num_patches of size patch_size"""
    original_size = np.stack([h, w])  # [1, 2]
+    original_res = h * w
    tilings = []
    for i in range(1, max_num_crops + 1):
        for j in range(1, max_num_crops + 1):
@@ -398,17 +406,13 @@ class MolmoAct2ImageProcessor(BaseImageProcessor):
        image_std: float | list[float] | None = None,
        do_convert_rgb: bool = True,
        max_crops: int = 8,
-        overlap_margins: list[int] | None = None,
+        overlap_margins: list[int] = [4, 4],
        crop_mode: str = "overlap-and-resize-c2",
        patch_size: int = 14,
-        pooling_size: list[int] | None = None,
+        pooling_size: list[int] = [2, 2],
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
-        if overlap_margins is None:
-            overlap_margins = [4, 4]
-        if pooling_size is None:
-            pooling_size = [2, 2]
        size = size if size is not None else {"height": 378, "width": 378}
        size = get_size_dict(size, default_to_square=True)
        self.size = size
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,15 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """Inference utilities for MolmoAct2"""

-from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional, Tuple
+from collections.abc import Iterable, Sequence

 import torch
-from torch.nn import functional as F  # noqa: N812
+from torch.nn import functional as F
 from transformers.cache_utils import Cache
 from transformers.configuration_utils import PretrainedConfig

@@ -676,7 +679,7 @@ def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:


 def _copy_context_(dst: Any, src: Any) -> None:
-    for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts, strict=False):
+    for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
        dst_k.copy_(src_k)
        dst_v.copy_(src_v)
    if src.cross_mask is not None:
@@ -686,7 +689,7 @@ def _copy_context_(dst: Any, src: Any) -> None:
    if src.valid_action is not None:
        dst.valid_action.copy_(src.valid_action)
    if src.rope_cache is not None:
-        for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache, strict=False):
+        for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
            dst_tensor.copy_(src_tensor)


@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,25 +14,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """Modeling code for MolmoAct2"""

-# ruff: noqa: N806
-
 import json
 import math
 import os
 import re
-from collections.abc import Callable, Mapping, Sequence
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
+from collections.abc import Callable, Mapping, Sequence

 import numpy as np
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import functional as F  # noqa: N812
+from torch.nn import functional as F
 from torch.nn.attention import SDPBackend, sdpa_kernel
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
@@ -646,7 +647,7 @@ class ActionExpert(nn.Module):
                f"got {len(encoder_kv_states)}."
            )
        kv_contexts = []
-        for block, (k_in, v_in) in zip(self.blocks, encoder_kv_states, strict=False):
+        for block, (k_in, v_in) in zip(self.blocks, encoder_kv_states):
            k_ctx = self._project_kv_tensor(k_in, self.context_k_proj)
            v_ctx = self._project_kv_tensor(v_in, self.context_v_proj)
            k_norm = block.cross_attn.k_norm
@@ -731,7 +732,7 @@ class ActionExpert(nn.Module):
        timesteps: Sequence[torch.Tensor],
    ) -> Sequence[ActionExpertStepModulation]:
        cache = []
-        for _idx, step_t in enumerate(timesteps):
+        for idx, step_t in enumerate(timesteps):
            conditioning = self._time_conditioning(step_t)
            block_modulations = []
            for block in self.blocks:
@@ -785,8 +786,8 @@ class ActionExpert(nn.Module):
        x = self.action_embed(actions)
        if context.valid_action is not None:
            x = x * context.valid_action
-        for _idx, (block, kv_context, block_modulation) in enumerate(
-            zip(self.blocks, context.kv_contexts, block_modulations, strict=False)
+        for idx, (block, kv_context, block_modulation) in enumerate(
+            zip(self.blocks, context.kv_contexts, block_modulations)
        ):
            x = block(
                x,
@@ -2873,7 +2874,7 @@ class MolmoAct2Model(MolmoAct2PreTrainedModel):
                    depth_mask=depth_mask,
                    encoder_attention_mask=encoder_attention_mask,
                )
-                for gate, source in zip(gate_head, sources, strict=False)
+                for gate, source in zip(gate_head, sources)
            ]
            return gates, depth_mask
        gate = self._depth_gate_from_source(
@@ -4457,7 +4458,7 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
        ```python
        >>> from PIL import Image
        >>> import requests
-        >>> from lerobot.policies.molmoact2.molmoact2_hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
+        >>> from lerobot.policies.molmoact2.hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
        >>> from lerobot.policies.molmoact2.processor_molmoact2 import _load_local_molmoact2_processor

        >>> model = MolmoAct2ForConditionalGeneration.from_pretrained("...")
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,39 +14,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """
 Processor class for MolmoAct2.
 """

+from typing import Optional, Union
+import dataclasses
+
 import numpy as np
-from transformers import AutoTokenizer
-from transformers.feature_extraction_utils import BatchFeature
+
 from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
 from transformers.processing_utils import (
+    Unpack,
    ProcessingKwargs,
    ProcessorMixin,
-    Unpack,
 )
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
 from transformers.utils import logging
-from transformers.video_utils import VideoInput

-from .image_processing_molmoact2 import MolmoAct2ImageProcessor, MolmoAct2ImagesKwargs
-from .video_processing_molmoact2 import MolmoAct2VideoProcessor, MolmoAct2VideoProcessorKwargs
+from transformers import AutoTokenizer
+from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
+from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
+

 logger = logging.get_logger(__name__)


 # Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
-IMAGE_PATCH_TOKEN = "<im_patch>"  # nosec B105  # Where to insert high-res tokens
-IMAGE_LOW_RES_TOKEN = "<im_low>"  # nosec B105  # Where to insert low-res tokens
-IM_START_TOKEN = "<im_start>"  # nosec B105
-LOW_RES_IMAGE_START_TOKEN = "<low_res_im_start>"  # nosec B105
-FRAME_START_TOKEN = "<frame_start>"  # nosec B105
-IM_END_TOKEN = "<im_end>"  # nosec B105
-FRAME_END_TOKEN = "<frame_end>"  # nosec B105
-IM_COL_TOKEN = "<im_col>"  # nosec B105
+IMAGE_PATCH_TOKEN = f"<im_patch>"  # Where to insert high-res tokens
+IMAGE_LOW_RES_TOKEN = f"<im_low>"  # Where to insert low-res tokens
+IM_START_TOKEN = f"<im_start>"
+LOW_RES_IMAGE_START_TOKEN = f"<low_res_im_start>"
+FRAME_START_TOKEN = f"<frame_start>"
+IM_END_TOKEN = f"<im_end>"
+FRAME_END_TOKEN = f"<frame_end>"
+IM_COL_TOKEN = f"<im_col>"
 IMAGE_PROMPT = "<|image|>"
 VIDEO_PROMPT = "<|video|>"

@@ -216,7 +224,7 @@ class MolmoAct2Processor(ProcessorMixin):
            input_ids = input_ids[None, :]
            attention_mask = attention_mask[None, :]

-        B, S = input_ids.shape  # noqa: N806
+        B, S = input_ids.shape

        # Handle zero-length sequence
        if S == 0:
@@ -356,7 +364,7 @@ class MolmoAct2Processor(ProcessorMixin):
                assert num_videos in {0, 1}, "At most one video is supported for now"
                video_grids_i = video_grids[index : index + num_videos]
                metadata_i = video_metadata[index : index + num_videos]
-                for video_grid, metadata in zip(video_grids_i, metadata_i, strict=False):
+                for video_grid, metadata in zip(video_grids_i, metadata_i):
                    video_string = self.get_video_string(
                        video_grid,
                        metadata.timestamps,
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,23 +14,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# ruff: noqa

 """Video processor class for MolmoAct2"""

+from functools import partial
 import os
 import warnings
-from collections.abc import Callable
 from contextlib import redirect_stdout
-from functools import partial
 from io import BytesIO
 from urllib.parse import urlparse
+from typing import Optional, Union
+from collections.abc import Callable

-import einops
 import numpy as np
 import requests
+import einops
 import torch
 import torchvision.transforms
-from transformers.feature_extraction_utils import BatchFeature
+
 from transformers.image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
@@ -37,24 +41,27 @@ from transformers.image_utils import (
    SizeDict,
    validate_kwargs,
 )
+from transformers.video_utils import (
+    VideoInput,
+    is_valid_video,
+    make_batched_videos,
+    make_batched_metadata,
+    VideoMetadata,
+)
 from transformers.processing_utils import Unpack, VideosKwargs
+from transformers.video_processing_utils import BaseVideoProcessor
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
 from transformers.utils import (
-    TensorType,
    is_av_available,
    is_decord_available,
    is_torchcodec_available,
    is_yt_dlp_available,
+    TensorType,
    logging,
    to_numpy,
 )
-from transformers.video_processing_utils import BaseVideoProcessor
-from transformers.video_utils import (
-    VideoInput,
-    VideoMetadata,
-    is_valid_video,
-    make_batched_metadata,
-    make_batched_videos,
-)
+

 logger = logging.get_logger(__name__)

@@ -95,8 +102,8 @@ def resize_image(
        )(image)
        resized = torch.clip(resized, 0.0, 1.0).to(dtype)
    else:
-        assert image.dtype == torch.uint8, (
-            f"SigLIP expects float images or uint8 images, but got {image.dtype}"
+        assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            image.dtype
        )
        in_min = 0.0
        in_max = 255.0
@@ -541,8 +548,9 @@ def get_target_fps(
        step_size = max(int(video_fps / target_fps), 1)
        num_frames_sampled_at_fps = int(total_frames / step_size)
        if num_frames_sampled == 0:
-            if "uniform" in frame_sample_mode and num_frames_sampled_at_fps > max_frames:
-                break
+            if "uniform" in frame_sample_mode:
+                if num_frames_sampled_at_fps > max_frames:
+                    break
            selected_target_fps = target_fps
            num_frames_sampled = num_frames_sampled_at_fps

@@ -771,15 +779,13 @@ class MolmoAct2VideoProcessor(BaseVideoProcessor):
        elif is_torchcodec_available():
            warnings.warn(
                "`decord` is not installed and cannot be used to decode the video by default. "
-                "Falling back to `torchcodec`.",
-                stacklevel=2,
+                "Falling back to `torchcodec`."
            )
            backend = "torchcodec"
        else:
            warnings.warn(
                "`decord` is not installed and cannot be used to decode the video by default. "
-                "Falling back to `PyAV`.",
-                stacklevel=2,
+                "Falling back to `PyAV`."
            )
            backend = "pyav"

@@ -789,8 +795,7 @@ class MolmoAct2VideoProcessor(BaseVideoProcessor):
                    *[
                        self.fetch_videos(x, sample_timestamps_fn=sample_timestamps_fn)
                        for x in video_url_or_urls
-                    ],
-                    strict=False,
+                    ]
                )
            )
        else:
@@ -816,7 +821,7 @@ class MolmoAct2VideoProcessor(BaseVideoProcessor):
            assert video_metadata[0].fps is not None, "FPS must be provided for video input"
            sampled_videos = []
            sampled_metadata = []
-            for video, metadata in zip(videos, video_metadata, strict=False):
+            for video, metadata in zip(videos, video_metadata):
                indices = sample_indices_fn(metadata=metadata)
                metadata.frames_indices = indices
                sampled_videos.append(video[indices])
@@ -980,11 +985,11 @@ class MolmoAct2VideoProcessor(BaseVideoProcessor):
        pixel_values_videos = np.concatenate(batch_crops, 0)
        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)

-        data = {
-            "pixel_values_videos": pixel_values_videos,
-            "video_token_pooling": video_token_pooling,
-            "video_grids": video_grids,
-        }
+        data = dict(
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+        )

        return BatchFeature(data, tensor_type=return_tensors)

@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,22 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""MolmoAct2 policy for LeRobot.
-
-MolmoAct2 is a VLM-based robotics policy from Allen AI that combines a
-Molmo vision-language backbone with a per-layer flow-matching action expert
-for continuous action generation, plus an optional discrete action token
-head. This module wraps the vendored HF model implementation
-(``molmoact2_hf_model/``) into the LeRobot ``PreTrainedPolicy`` interface.
-
-Paper:  https://allenai.org/blog/molmoact2
-Code:   https://github.com/allenai/molmoact2
-"""
-
 from __future__ import annotations

 import json
-import logging
 import os
 import types
 from collections import deque
@@ -46,58 +35,13 @@ from lerobot.utils.constants import ACTION
 from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package

 from ..rtc.modeling_rtc import RTCProcessor
-from .configuration_molmoact2 import MolmoAct2Config
-
-logger = logging.getLogger(__name__)
-
-
-def _hf_token() -> str | None:
-    return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")
-
-
-def _resolve_checkpoint_location(
-    checkpoint_path: str,
-    *,
-    revision: str | None = None,
-    force_download: bool = False,
-) -> str:
-    """Resolve a checkpoint path to a local directory, downloading from Hub if needed."""
-    checkpoint_path = str(checkpoint_path or "").strip()
-    if not checkpoint_path:
-        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
-    from pathlib import Path
-
-    local_path = Path(checkpoint_path).expanduser()
-    if local_path.exists():
-        return str(local_path)
-    from huggingface_hub import snapshot_download
-
-    return snapshot_download(
-        repo_id=checkpoint_path,
-        repo_type="model",
-        revision=revision,
-        force_download=force_download,
-        ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
-        token=_hf_token(),
-    )
-
-
-def _torch_dtype(dtype: str) -> torch.dtype:
-    """Convert a dtype name string to a torch.dtype."""
-    if dtype == "float32":
-        return torch.float32
-    if dtype == "bfloat16":
-        return torch.bfloat16
-    if dtype == "float16":
-        return torch.float16
-    raise ValueError(f"Unsupported dtype: {dtype}")
-
+from .configuration_molmoact2 import MolmoAct2Config, _hf_token, _resolve_checkpoint_location

 if TYPE_CHECKING or _transformers_available:
    from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME

-    from .molmoact2_hf_model.configuration_molmoact2 import MolmoAct2Config as HFMolmoAct2Config
-    from .molmoact2_hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
+    from .hf_model.configuration_molmoact2 import MolmoAct2Config as HFMolmoAct2Config
+    from .hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
 else:
    SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
    SAFE_WEIGHTS_NAME = "model.safetensors"
@@ -105,7 +49,7 @@ else:
    MolmoAct2ForConditionalGeneration = None

 if TYPE_CHECKING or (_transformers_available and _scipy_available):
-    from .molmoact2_hf_model.action_tokenizer import UniversalActionProcessor
+    from .hf_model.action_tokenizer import UniversalActionProcessor
 else:
    UniversalActionProcessor = None

@@ -126,156 +70,6 @@ _MODEL_INPUT_KEYS = {
 }


-def _load_hf_norm_metadata_for_tag(
-    checkpoint_path: str,
-    *,
-    revision: str | None,
-    force_download: bool,
-    norm_tag: str | None,
-) -> dict[str, Any]:
-    """Read per-tag metadata from the checkpoint's ``norm_stats.json``."""
-    norm_tag = str(norm_tag or "").strip()
-    if not norm_tag:
-        return {}
-    from contextlib import suppress
-    from pathlib import Path
-
-    checkpoint_location = Path(
-        _resolve_checkpoint_location(
-            checkpoint_path,
-            revision=revision,
-            force_download=force_download,
-        )
-    )
-    norm_stats_filename = "norm_stats.json"
-    config_path = checkpoint_location / "config.json"
-    if config_path.exists():
-        with suppress(OSError, json.JSONDecodeError):
-            norm_stats_filename = str(
-                json.loads(config_path.read_text()).get("norm_stats_filename") or norm_stats_filename
-            )
-    stats_path = checkpoint_location / norm_stats_filename
-    if not stats_path.exists():
-        raise FileNotFoundError(
-            f"MolmoAct2 HF checkpoint is missing {norm_stats_filename!r}; cannot resolve norm_tag={norm_tag!r}."
-        )
-    payload = json.loads(stats_path.read_text())
-    metadata_by_tag = payload.get("metadata_by_tag")
-    if not isinstance(metadata_by_tag, dict):
-        raise ValueError(f"MolmoAct2 norm stats file {stats_path} has no metadata_by_tag mapping.")
-    metadata = metadata_by_tag.get(norm_tag)
-    if not isinstance(metadata, dict):
-        available = sorted(str(tag) for tag in metadata_by_tag)
-        raise ValueError(f"Unknown MolmoAct2 norm_tag={norm_tag!r}. Available tags: {available}.")
-    return metadata
-
-
-def _apply_norm_tag_metadata(config: MolmoAct2Config) -> None:
-    """Populate config fields from the checkpoint's norm-tag metadata."""
-    if not str(config.norm_tag or "").strip():
-        return
-    metadata = _load_hf_norm_metadata_for_tag(
-        config.checkpoint_path,
-        revision=config.checkpoint_revision,
-        force_download=bool(config.checkpoint_force_download),
-        norm_tag=config.norm_tag,
-    )
-    if metadata.get("action_horizon") is not None:
-        config.chunk_size = int(metadata["action_horizon"])
-    if metadata.get("n_action_steps") is not None:
-        config.n_action_steps = int(metadata["n_action_steps"])
-    if not config.setup_type and metadata.get("setup_type") is not None:
-        config.setup_type = str(metadata["setup_type"])
-    if not config.control_mode and metadata.get("control_mode") is not None:
-        config.control_mode = str(metadata["control_mode"])
-
-
-def _saved_policy_action_mode(config: MolmoAct2Config) -> str | None:
-    """Read the action mode from a LeRobot-saved checkpoint's ``config.json``."""
-    from pathlib import Path
-
-    pretrained_path = getattr(config, "pretrained_path", None)
-    if pretrained_path is None:
-        return None
-    config_path = Path(pretrained_path) / "config.json"
-    if not config_path.exists():
-        return None
-    try:
-        mode = json.loads(config_path.read_text()).get("action_mode")
-    except (OSError, json.JSONDecodeError):
-        return None
-    if mode in {"continuous", "discrete", "both"}:
-        return str(mode)
-    return None
-
-
-def _training_action_mode(config: MolmoAct2Config, saved_policy_action_mode: str | None = None) -> str:
-    return saved_policy_action_mode or config.action_mode
-
-
-def _validate_inference_action_mode(
-    config: MolmoAct2Config, saved_policy_action_mode: str | None = None
-) -> None:
-    """Check that the requested inference mode is compatible with the training mode."""
-    requested_mode = config.inference_action_mode
-    if requested_mode is None:
-        return
-    training_mode = _training_action_mode(config, saved_policy_action_mode)
-    if requested_mode == "continuous" and training_mode == "discrete":
-        raise ValueError(
-            "MolmoAct2 checkpoint was trained with action_mode='discrete' and cannot run "
-            "continuous inference."
-        )
-    if requested_mode == "discrete" and training_mode == "continuous":
-        raise ValueError(
-            "MolmoAct2 checkpoint was trained with action_mode='continuous' and cannot run "
-            "discrete inference. Train with action_mode='both' or action_mode='discrete' first."
-        )
-
-
-def _validate_checkpoint_action_mode(
-    config: MolmoAct2Config,
-    checkpoint_action_mode: str,
-    *,
-    has_action_expert: bool,
-) -> None:
-    """Check that the checkpoint's action mode is compatible with the config."""
-    if config.action_mode == "both" and checkpoint_action_mode != "both":
-        raise ValueError(
-            f"action_mode='both' requires checkpoint action_mode='both', got {checkpoint_action_mode!r}."
-        )
-    if config.action_mode == "discrete" and checkpoint_action_mode not in {"discrete", "both"}:
-        raise ValueError(
-            f"action_mode='discrete' requires checkpoint action_mode in {{'discrete', 'both'}}, "
-            f"got {checkpoint_action_mode!r}."
-        )
-    if config.action_mode in {"continuous", "both"} and not has_action_expert:
-        raise ValueError("Continuous MolmoAct2 training requires an action expert checkpoint.")
-
-
-def _resolve_inference_action_mode(
-    config: MolmoAct2Config,
-    requested_mode: str | None,
-    saved_policy_action_mode: str | None = None,
-) -> str:
-    """Resolve the final inference action mode, validating compatibility."""
-    training_mode = _training_action_mode(config, saved_policy_action_mode)
-    if requested_mode is None:
-        requested_mode = config.inference_action_mode
-    if requested_mode is None:
-        raise ValueError(
-            "MolmoAct2 inference requires `inference_action_mode` to be set explicitly "
-            "to either 'continuous' or 'discrete'."
-        )
-    if requested_mode not in {"continuous", "discrete"}:
-        raise ValueError("MolmoAct2 inference_action_mode must be either 'continuous' or 'discrete'.")
-    if requested_mode == "continuous" and training_mode == "discrete":
-        raise ValueError("MolmoAct2 action_mode='discrete' checkpoint cannot run continuous inference.")
-    if requested_mode == "discrete" and training_mode == "continuous":
-        raise ValueError("MolmoAct2 action_mode='continuous' checkpoint cannot run discrete inference.")
-    return requested_mode
-
-
 def _strict_load_safetensors_weights(model: torch.nn.Module, checkpoint_location: str) -> None:
    index_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_INDEX_NAME)
    single_file_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_NAME)
@@ -309,6 +103,16 @@ def _strict_load_safetensors_weights(model: torch.nn.Module, checkpoint_location
    )


+def _torch_dtype(dtype: str) -> torch.dtype:
+    if dtype == "float32":
+        return torch.float32
+    if dtype == "bfloat16":
+        return torch.bfloat16
+    if dtype == "float16":
+        return torch.float16
+    raise ValueError(f"Unsupported dtype: {dtype}")
+
+
 def _sample_beta_timesteps(
    *,
    batch_size: int,
@@ -332,180 +136,7 @@ def _sample_beta_timesteps(
    return time_offset + scale * samples


-def _mask_discrete_action_spans(
-    *,
-    input_ids: Tensor,
-    mask: Tensor,
-    start_token_id: int | None,
-    end_token_id: int | None,
-) -> Tensor:
-    if start_token_id is None or end_token_id is None:
-        return mask
-    mask = mask.clone()
-    for batch_idx in range(input_ids.shape[0]):
-        row = input_ids[batch_idx]
-        starts = (row == int(start_token_id)).nonzero(as_tuple=False).flatten().tolist()
-        ends = (row == int(end_token_id)).nonzero(as_tuple=False).flatten().tolist()
-        end_ptr = 0
-        for start in starts:
-            while end_ptr < len(ends) and ends[end_ptr] < start:
-                end_ptr += 1
-            if end_ptr >= len(ends):
-                mask[batch_idx, start:] = False
-                break
-            end = int(ends[end_ptr])
-            mask[batch_idx, start : end + 1] = False
-            end_ptr += 1
-    return mask
-
-
-def _drop_trivial_attention_mask(model_inputs: dict[str, Tensor]) -> dict[str, Tensor]:
-    attention_mask = model_inputs.get("attention_mask")
-    if torch.is_tensor(attention_mask) and bool(attention_mask.to(dtype=torch.bool).all().item()):
-        model_inputs = dict(model_inputs)
-        model_inputs.pop("attention_mask", None)
-    return model_inputs
-
-
-def _expand_mask(mask: Tensor | None, num_flow_timesteps: int) -> Tensor | None:
-    if mask is None:
-        return None
-    return (
-        mask.unsqueeze(1)
-        .expand(-1, num_flow_timesteps, *([-1] * (mask.ndim - 1)))
-        .reshape(mask.shape[0] * num_flow_timesteps, *mask.shape[1:])
-    )
-
-
-def _action_dim_valid_mask(target: Tensor, action_dim_is_pad: Tensor | None) -> Tensor | None:
-    if action_dim_is_pad is None:
-        return None
-    mask = ~action_dim_is_pad.to(device=target.device, dtype=torch.bool)
-    if mask.ndim == 1:
-        mask = mask.unsqueeze(0)
-    if mask.shape[-1] != target.shape[-1]:
-        raise ValueError(
-            f"action_dim_is_pad width {mask.shape[-1]} does not match target width {target.shape[-1]}."
-        )
-    if mask.shape[0] == 1 and target.shape[0] != 1:
-        mask = mask.expand(target.shape[0], -1)
-    if mask.shape[0] != target.shape[0]:
-        raise ValueError(
-            f"action_dim_is_pad batch {mask.shape[0]} does not match target batch {target.shape[0]}."
-        )
-    while mask.ndim < target.ndim:
-        mask = mask.unsqueeze(1)
-    return mask
-
-
-def _mask_action_dim_tensor(tensor: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
-    if action_dim_is_pad is None:
-        return tensor
-    valid_mask = _action_dim_valid_mask(tensor, action_dim_is_pad)
-    if valid_mask is None:
-        return tensor
-    return tensor.masked_fill(~valid_mask, 0)
-
-
-def _apply_action_dim_padding_mask(loss: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
-    valid_mask = _action_dim_valid_mask(loss, action_dim_is_pad)
-    if valid_mask is None:
-        return loss
-    valid = valid_mask.to(dtype=loss.dtype)
-    denom = valid.sum(dim=-1).clamp_min(1.0)
-    return (loss * valid).sum(dim=-1) / denom
-
-
-def _apply_action_chunk_padding_mask(loss: Tensor, action_horizon_is_pad: Tensor | None) -> Tensor:
-    if action_horizon_is_pad is None:
-        return loss
-    valid_action = (
-        (~action_horizon_is_pad.to(device=loss.device, dtype=torch.bool)).unsqueeze(1).unsqueeze(-1)
-    )
-    return loss * valid_action
-
-
-def _combine_rollout_seeds(first_seed: int, batch_size: int) -> int:
-    seed = 0
-    for idx in range(batch_size):
-        seed = (seed + (idx + 1) * (first_seed + idx)) % (2**63 - 1)
-    return seed
-
-
-def _rollout_task_signature(batch: dict[str, Any]) -> tuple[Any, ...] | None:
-    task = batch.get("task")
-    if task is None:
-        task = batch.get("observation.language")
-    if task is None:
-        return None
-    if isinstance(task, str):
-        return (task,)
-    if isinstance(task, (list, tuple)):
-        return tuple(str(item) for item in task)
-    return (str(task),)
-
-
-def _extract_discrete_token_bins(
-    generated_ids: list[int],
-    start_token_id: int,
-    end_token_id: int,
-    token_id_to_bin: dict[int, int],
-) -> list[int]:
-    start_idx = None
-    end_idx = None
-    for idx, token_id in enumerate(generated_ids):
-        if token_id == start_token_id:
-            start_idx = idx
-            break
-    if start_idx is not None:
-        for idx in range(start_idx + 1, len(generated_ids)):
-            if generated_ids[idx] == end_token_id:
-                end_idx = idx
-                break
-    span_start = 0 if start_idx is None else start_idx + 1
-    span_end = len(generated_ids) if end_idx is None else end_idx
-    return [
-        int(token_id_to_bin[token_id])
-        for token_id in generated_ids[span_start:span_end]
-        if token_id in token_id_to_bin
-    ]
-
-
-def _weighted_mean(values: Tensor, weights: Tensor | None) -> Tensor:
-    if weights is None:
-        return values.mean()
-    weights = weights.to(device=values.device, dtype=values.dtype)
-    return torch.dot(values, weights) / weights.sum().clamp_min(1.0)
-
-
-def _weighted_per_example(
-    values: Tensor,
-    weights: Tensor | None,
-    example_indices: Tensor,
-    batch_size: int,
-) -> Tensor:
-    values = values.float()
-    if weights is None:
-        weights = torch.ones_like(values)
-    else:
-        weights = weights.to(device=values.device, dtype=values.dtype)
-    loss_sum = torch.zeros(batch_size, device=values.device, dtype=torch.float32)
-    weight_sum = torch.zeros(batch_size, device=values.device, dtype=torch.float32)
-    loss_sum.scatter_add_(0, example_indices, values * weights)
-    weight_sum.scatter_add_(0, example_indices, weights)
-    global_weight_sum = weight_sum.sum().clamp_min(1.0)
-    return loss_sum * float(batch_size) / global_weight_sum
-
-
 class MolmoAct2Policy(PreTrainedPolicy):
-    """MolmoAct2 policy wrapping the vendored HF model for LeRobot.
-
-    Supports three training modes via ``config.action_mode``:
-    ``"continuous"`` (flow-matching only), ``"discrete"`` (autoregressive
-    token prediction only), or ``"both"`` (joint loss). At inference,
-    ``config.inference_action_mode`` selects which head generates actions.
-    """
-
    config_class = MolmoAct2Config
    name = "molmoact2"

@@ -518,10 +149,10 @@ class MolmoAct2Policy(PreTrainedPolicy):
        **kwargs,
    ):
        super().__init__(config, *inputs, **kwargs)
-        _apply_norm_tag_metadata(self.config)
+        self.config.apply_norm_tag_metadata()
        self.config.validate_features()
        del inputs, kwargs, dataset_stats, dataset_meta
-        self._checkpoint_action_mode = _saved_policy_action_mode(self.config)
+        self._checkpoint_action_mode = self.config.saved_policy_action_mode()
        self._action_queue: deque[Tensor] = deque(maxlen=self.config.n_action_steps)
        self._rollout_action_generator: torch.Generator | None = None
        self._rollout_task_key: tuple[Any, ...] | None = None
@@ -529,7 +160,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
        self.rtc_processor: RTCProcessor | None = None
        self.action_tokenizer: Any | None = None
        self._load_hf_model()
-        _validate_inference_action_mode(self.config, self._checkpoint_action_mode)
+        self.config.validate_inference_action_mode(self._checkpoint_action_mode)
        if self.config.enable_lora_vlm:
            self._apply_lora_adapters()
        self.init_rtc_processor()
@@ -581,8 +212,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
                "`policy.checkpoint_force_download=true` after the updated files are pushed."
            )
        checkpoint_action_mode = str(self.model.config.action_mode)
-        _validate_checkpoint_action_mode(
-            self.config,
+        self.config.validate_checkpoint_action_mode(
            checkpoint_action_mode,
            has_action_expert=bool(getattr(self.model.config, "add_action_expert", False)),
        )
@@ -596,7 +226,6 @@ class MolmoAct2Policy(PreTrainedPolicy):
        self.train(self.training)

    def reset(self) -> None:
-        """Clear the action queue and rollout generator between episodes."""
        self._action_queue = deque(maxlen=self.config.n_action_steps)
        self._rollout_action_generator = None

@@ -705,7 +334,6 @@ class MolmoAct2Policy(PreTrainedPolicy):
            param.requires_grad = False

    def get_optim_params(self) -> list[dict[str, Any]]:
-        """Return optimizer param groups with per-component learning rates."""
        vit_params: list[Tensor] = []
        connector_params: list[Tensor] = []
        action_expert_params: list[Tensor] = []
@@ -791,6 +419,33 @@ class MolmoAct2Policy(PreTrainedPolicy):
                return int(value)
        raise RuntimeError("MolmoAct2 could not resolve an action generation horizon.")

+    @staticmethod
+    def _mask_discrete_action_spans(
+        *,
+        input_ids: Tensor,
+        mask: Tensor,
+        start_token_id: int | None,
+        end_token_id: int | None,
+    ) -> Tensor:
+        if start_token_id is None or end_token_id is None:
+            return mask
+        mask = mask.clone()
+        for batch_idx in range(input_ids.shape[0]):
+            row = input_ids[batch_idx]
+            starts = (row == int(start_token_id)).nonzero(as_tuple=False).flatten().tolist()
+            ends = (row == int(end_token_id)).nonzero(as_tuple=False).flatten().tolist()
+            end_ptr = 0
+            for start in starts:
+                while end_ptr < len(ends) and ends[end_ptr] < start:
+                    end_ptr += 1
+                if end_ptr >= len(ends):
+                    mask[batch_idx, start:] = False
+                    break
+                end = int(ends[end_ptr])
+                mask[batch_idx, start : end + 1] = False
+                end_ptr += 1
+        return mask
+
    def _encoder_attention_mask_for_action_expert(
        self,
        *,
@@ -815,13 +470,21 @@ class MolmoAct2Policy(PreTrainedPolicy):
        eos_token_id = getattr(self.model.config, "eos_token_id", None)
        if eos_token_id is not None:
            mask &= input_ids != int(eos_token_id)
-        return _mask_discrete_action_spans(
+        return self._mask_discrete_action_spans(
            input_ids=input_ids,
            mask=mask,
            start_token_id=getattr(self.model.config, "action_start_token_id", None),
            end_token_id=getattr(self.model.config, "action_end_token_id", None),
        )

+    @staticmethod
+    def _drop_trivial_attention_mask(model_inputs: dict[str, Tensor]) -> dict[str, Tensor]:
+        attention_mask = model_inputs.get("attention_mask")
+        if torch.is_tensor(attention_mask) and bool(attention_mask.to(dtype=torch.bool).all().item()):
+            model_inputs = dict(model_inputs)
+            model_inputs.pop("attention_mask", None)
+        return model_inputs
+
    def _load_discrete_action_tokenizer(self) -> Any:
        if self.action_tokenizer is None:
            require_package("transformers", extra="molmoact2")
@@ -835,7 +498,27 @@ class MolmoAct2Policy(PreTrainedPolicy):
        return self.action_tokenizer

    def _resolve_inference_action_mode(self, requested_mode: str | None) -> str:
-        return _resolve_inference_action_mode(self.config, requested_mode, self._checkpoint_action_mode)
+        return self.config.resolve_inference_action_mode(requested_mode, self._checkpoint_action_mode)
+
+    @staticmethod
+    def _combine_rollout_seeds(first_seed: int, batch_size: int) -> int:
+        seed = 0
+        for idx in range(batch_size):
+            seed = (seed + (idx + 1) * (first_seed + idx)) % (2**63 - 1)
+        return seed
+
+    @staticmethod
+    def _rollout_task_signature(batch: dict[str, Any]) -> tuple[Any, ...] | None:
+        task = batch.get("task")
+        if task is None:
+            task = batch.get("observation.language")
+        if task is None:
+            return None
+        if isinstance(task, str):
+            return (task,)
+        if isinstance(task, (list, tuple)):
+            return tuple(str(item) for item in task)
+        return (str(task),)

    def _rollout_generator_for_inputs(
        self,
@@ -849,7 +532,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
        if self._rollout_action_generator is not None:
            return self._rollout_action_generator

-        task_signature = _rollout_task_signature(batch)
+        task_signature = self._rollout_task_signature(batch)
        if task_signature != self._rollout_task_key:
            self._rollout_task_key = task_signature
            self._rollout_index_for_task = 0
@@ -862,10 +545,72 @@ class MolmoAct2Policy(PreTrainedPolicy):
            device if device.type == "cuda" and torch.cuda.is_available() else torch.device("cpu")
        )
        generator = torch.Generator(device=generator_device)
-        generator.manual_seed(_combine_rollout_seeds(first_seed, batch_size))
+        generator.manual_seed(self._combine_rollout_seeds(first_seed, batch_size))
        self._rollout_action_generator = generator
        return generator

+    @staticmethod
+    def _expand_mask(mask: Tensor | None, num_flow_timesteps: int) -> Tensor | None:
+        if mask is None:
+            return None
+        return (
+            mask.unsqueeze(1)
+            .expand(-1, num_flow_timesteps, *([-1] * (mask.ndim - 1)))
+            .reshape(mask.shape[0] * num_flow_timesteps, *mask.shape[1:])
+        )
+
+    @staticmethod
+    def _action_dim_valid_mask(target: Tensor, action_dim_is_pad: Tensor | None) -> Tensor | None:
+        if action_dim_is_pad is None:
+            return None
+        mask = ~action_dim_is_pad.to(device=target.device, dtype=torch.bool)
+        if mask.ndim == 1:
+            mask = mask.unsqueeze(0)
+        if mask.shape[-1] != target.shape[-1]:
+            raise ValueError(
+                f"action_dim_is_pad width {mask.shape[-1]} does not match target width {target.shape[-1]}."
+            )
+        if mask.shape[0] == 1 and target.shape[0] != 1:
+            mask = mask.expand(target.shape[0], -1)
+        if mask.shape[0] != target.shape[0]:
+            raise ValueError(
+                f"action_dim_is_pad batch {mask.shape[0]} does not match target batch {target.shape[0]}."
+            )
+        while mask.ndim < target.ndim:
+            mask = mask.unsqueeze(1)
+        return mask
+
+    @classmethod
+    def _mask_action_dim_tensor(cls, tensor: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
+        if not cls._mask_enabled_static(action_dim_is_pad):
+            return tensor
+        valid_mask = cls._action_dim_valid_mask(tensor, action_dim_is_pad)
+        if valid_mask is None:
+            return tensor
+        return tensor.masked_fill(~valid_mask, 0)
+
+    @staticmethod
+    def _mask_enabled_static(action_dim_is_pad: Tensor | None) -> bool:
+        return action_dim_is_pad is not None
+
+    @classmethod
+    def _apply_action_dim_padding_mask(cls, loss: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
+        valid_mask = cls._action_dim_valid_mask(loss, action_dim_is_pad)
+        if valid_mask is None:
+            return loss
+        valid = valid_mask.to(dtype=loss.dtype)
+        denom = valid.sum(dim=-1).clamp_min(1.0)
+        return (loss * valid).sum(dim=-1) / denom
+
+    @staticmethod
+    def _apply_action_chunk_padding_mask(loss: Tensor, action_horizon_is_pad: Tensor | None) -> Tensor:
+        if action_horizon_is_pad is None:
+            return loss
+        valid_action = (
+            (~action_horizon_is_pad.to(device=loss.device, dtype=torch.bool)).unsqueeze(1).unsqueeze(-1)
+        )
+        return loss * valid_action
+
    def _prepare_flow_matching_tensors(
        self,
        *,
@@ -904,7 +649,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
                )

        if self.config.mask_action_dim_padding:
-            actions = _mask_action_dim_tensor(actions, action_dim_is_pad)
+            actions = self._mask_action_dim_tensor(actions, action_dim_is_pad)

        expected_noise_shape = (batch_size, num_flow_timesteps, actions.shape[1], actions.shape[2])
        if noise is None:
@@ -916,7 +661,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
                    f"flow noise must have shape {expected_noise_shape}, got {tuple(noise.shape)}."
                )
        if self.config.mask_action_dim_padding:
-            noise = _mask_action_dim_tensor(noise, action_dim_is_pad)
+            noise = self._mask_action_dim_tensor(noise, action_dim_is_pad)

        t_broadcast = timesteps.view(batch_size, num_flow_timesteps, 1, 1)
        actions_expanded = actions.unsqueeze(1).expand(-1, num_flow_timesteps, -1, -1)
@@ -1044,7 +789,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
        valid_action = None
        if action_attention_mask is not None:
            valid_action = action_attention_mask.to(device=device, dtype=actions.dtype).unsqueeze(-1)
-            valid_action = _expand_mask(valid_action, num_flow_timesteps)
+            valid_action = self._expand_mask(valid_action, num_flow_timesteps)

        rope_cache = None
        if len(action_expert.blocks) > 0 and action_expert.blocks[0].self_attn.rope is not None:
@@ -1059,14 +804,14 @@ class MolmoAct2Policy(PreTrainedPolicy):
            batch_size,
            actions.dtype,
        )
-        cross_mask = _expand_mask(cross_mask, num_flow_timesteps)
+        cross_mask = self._expand_mask(cross_mask, num_flow_timesteps)
        self_mask = action_expert._build_self_attention_mask(
            action_attention_mask,
            actions.shape[1],
            device,
            actions.dtype,
        )
-        self_mask = _expand_mask(self_mask, num_flow_timesteps)
+        self_mask = self._expand_mask(self_mask, num_flow_timesteps)

        conditioning = self._action_time_conditioning(action_expert, timesteps_flat)
        action_hidden = action_expert.action_embed(xt_flat)
@@ -1126,8 +871,8 @@ class MolmoAct2Policy(PreTrainedPolicy):
            if k_norm is not None:
                k_ctx = k_norm(k_ctx.transpose(1, 2)).transpose(1, 2)
            if num_flow_timesteps != 1:
-                k_ctx = _expand_mask(k_ctx, num_flow_timesteps)
-                v_ctx = _expand_mask(v_ctx, num_flow_timesteps)
+                k_ctx = self._expand_mask(k_ctx, num_flow_timesteps)
+                v_ctx = self._expand_mask(v_ctx, num_flow_timesteps)

            next_action_hidden = action_block(
                layer_action_hidden,
@@ -1167,9 +912,9 @@ class MolmoAct2Policy(PreTrainedPolicy):
        )

        loss = F.mse_loss(pred_velocity, target_velocity, reduction="none")
-        loss = _apply_action_chunk_padding_mask(loss, batch.get("action_horizon_is_pad"))
+        loss = self._apply_action_chunk_padding_mask(loss, batch.get("action_horizon_is_pad"))
        if self.config.mask_action_dim_padding:
-            loss = _apply_action_dim_padding_mask(loss, batch.get("action_dim_is_pad"))
+            loss = self._apply_action_dim_padding_mask(loss, batch.get("action_dim_is_pad"))
        loss = loss.reshape(batch_size, -1).mean(dim=1)
        if reduction == "mean":
            loss = loss.mean()
@@ -1188,6 +933,32 @@ class MolmoAct2Policy(PreTrainedPolicy):
        example_weights[nonempty] = 2.0 / torch.sqrt(token_counts[nonempty])
        return example_weights[:, None].expand_as(valid_positions)[valid_positions].to(dtype=torch.float32)

+    @staticmethod
+    def _weighted_mean(values: Tensor, weights: Tensor | None) -> Tensor:
+        if weights is None:
+            return values.mean()
+        weights = weights.to(device=values.device, dtype=values.dtype)
+        return torch.dot(values, weights) / weights.sum().clamp_min(1.0)
+
+    @staticmethod
+    def _weighted_per_example(
+        values: Tensor,
+        weights: Tensor | None,
+        example_indices: Tensor,
+        batch_size: int,
+    ) -> Tensor:
+        values = values.float()
+        if weights is None:
+            weights = torch.ones_like(values)
+        else:
+            weights = weights.to(device=values.device, dtype=values.dtype)
+        loss_sum = torch.zeros(batch_size, device=values.device, dtype=torch.float32)
+        weight_sum = torch.zeros(batch_size, device=values.device, dtype=torch.float32)
+        loss_sum.scatter_add_(0, example_indices, values * weights)
+        weight_sum.scatter_add_(0, example_indices, weights)
+        global_weight_sum = weight_sum.sum().clamp_min(1.0)
+        return loss_sum * float(batch_size) / global_weight_sum
+
    def _discrete_loss_from_backbone_outputs(
        self,
        batch: dict[str, Tensor],
@@ -1221,28 +992,56 @@ class MolmoAct2Policy(PreTrainedPolicy):
        token_weights = self._discrete_token_weights(valid_positions)
        if reduction == "none":
            example_indices = valid_positions.nonzero(as_tuple=False)[:, 0].to(device=hidden_states.device)
-            ce_loss = _weighted_per_example(
+            ce_loss = self._weighted_per_example(
                token_ce_loss,
                token_weights,
                example_indices,
                int(labels.shape[0]),
            )
        else:
-            ce_loss = _weighted_mean(token_ce_loss, token_weights)
+            ce_loss = self._weighted_mean(token_ce_loss, token_weights)
        if not self.config.softmax_auxiliary_loss:
            return ce_loss, None

        if reduction == "none":
-            z_loss = self.config.softmax_auxiliary_loss_scale * _weighted_per_example(
+            z_loss = self.config.softmax_auxiliary_loss_scale * self._weighted_per_example(
                log_z.pow(2),
                token_weights,
                example_indices,
                int(labels.shape[0]),
            )
        else:
-            z_loss = self.config.softmax_auxiliary_loss_scale * _weighted_mean(log_z.pow(2), token_weights)
+            z_loss = self.config.softmax_auxiliary_loss_scale * self._weighted_mean(
+                log_z.pow(2), token_weights
+            )
        return ce_loss, z_loss

+    @staticmethod
+    def _extract_discrete_token_bins(
+        generated_ids: list[int],
+        start_token_id: int,
+        end_token_id: int,
+        token_id_to_bin: dict[int, int],
+    ) -> list[int]:
+        start_idx = None
+        end_idx = None
+        for idx, token_id in enumerate(generated_ids):
+            if token_id == start_token_id:
+                start_idx = idx
+                break
+        if start_idx is not None:
+            for idx in range(start_idx + 1, len(generated_ids)):
+                if generated_ids[idx] == end_token_id:
+                    end_idx = idx
+                    break
+        span_start = 0 if start_idx is None else start_idx + 1
+        span_end = len(generated_ids) if end_idx is None else end_idx
+        return [
+            int(token_id_to_bin[token_id])
+            for token_id in generated_ids[span_start:span_end]
+            if token_id in token_id_to_bin
+        ]
+
    def _action_token_id_to_bin(self) -> dict[int, int]:
        method = getattr(self.model, "_action_token_id_to_bin", None)
        if callable(method):
@@ -1380,7 +1179,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
        chunks: list[Tensor] = []
        for token_row in generated_token_ids:
            generated_ids = [int(token_id) for token_id in token_row.detach().cpu().tolist()]
-            discrete_token_ids = _extract_discrete_token_bins(
+            discrete_token_ids = self._extract_discrete_token_bins(
                generated_ids,
                int(self.model.config.action_start_token_id),
                int(self.model.config.action_end_token_id),
@@ -1419,7 +1218,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
        model_inputs: dict[str, Tensor],
        action_dim: int,
    ) -> Tensor:
-        model_inputs = _drop_trivial_attention_mask(model_inputs)
+        model_inputs = self._drop_trivial_attention_mask(model_inputs)
        max_steps = self._discrete_generation_max_steps()
        static_cache, attention_bias = self._make_discrete_ar_graph_decode_inputs(
            model_inputs,
@@ -1495,7 +1294,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
            generator=generator,
        )
        if self.config.mask_action_dim_padding:
-            trajectory = _mask_action_dim_tensor(trajectory, action_dim_is_pad)
+            trajectory = self._mask_action_dim_tensor(trajectory, action_dim_is_pad)

        action_context = action_expert.prepare_context(
            encoder_kv_states=encoder_kv_states,
@@ -1528,7 +1327,7 @@ class MolmoAct2Policy(PreTrainedPolicy):
                    modulation=step_modulation,
                )
                if mask_enabled:
-                    velocity = _mask_action_dim_tensor(velocity, action_dim_is_pad)
+                    velocity = self._mask_action_dim_tensor(velocity, action_dim_is_pad)
                return velocity

            if self._rtc_enabled():
@@ -1553,7 +1352,7 @@ class MolmoAct2Policy(PreTrainedPolicy):

            trajectory = trajectory + dt * velocity
            if mask_enabled:
-                trajectory = _mask_action_dim_tensor(trajectory, action_dim_is_pad)
+                trajectory = self._mask_action_dim_tensor(trajectory, action_dim_is_pad)
            if self.rtc_processor is not None and self.rtc_processor.is_debug_enabled():
                self.rtc_processor.track(time=float(flow_timestep[0].item()), x_t=trajectory, v_t=velocity)

@@ -1564,7 +1363,6 @@ class MolmoAct2Policy(PreTrainedPolicy):
        batch: dict[str, Tensor],
        reduction: str = "mean",
    ) -> tuple[Tensor, dict[str, Any]]:
-        """Compute training loss (flow-matching and/or discrete token loss)."""
        if reduction not in {"mean", "none"}:
            raise ValueError(f"Unsupported reduction={reduction!r}. Expected 'mean' or 'none'.")
        model_inputs = self._model_inputs(batch)
@@ -1624,7 +1422,6 @@ class MolmoAct2Policy(PreTrainedPolicy):

    @torch.no_grad()
    def predict_action_chunk(self, batch: dict[str, Tensor], **kwargs) -> Tensor:
-        """Generate an action chunk via continuous flow matching or discrete AR decoding."""
        if "action_mode" in kwargs:
            raise TypeError(
                "MolmoAct2 predict_action_chunk got unexpected keyword argument 'action_mode'; "
@@ -1679,7 +1476,6 @@ class MolmoAct2Policy(PreTrainedPolicy):

    @torch.no_grad()
    def select_action(self, batch: dict[str, Tensor], **kwargs) -> Tensor:
-        """Pop one action step from the queue, regenerating the chunk when empty."""
        if self._rtc_enabled():
            raise AssertionError("RTC is not supported for select_action, use it with predict_action_chunk")
        self.eval()
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,18 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""MolmoAct2 pre/post processing pipeline.
-
-Builds the multimodal prompt (images, discretised state, task text),
-tokenises it via the vendored MolmoAct2 processor, and handles quantile
-normalisation with optional per-dimension gripper masking.
-"""
-
 from __future__ import annotations

 import json
-import logging
-import math
+import os
 import re
 from contextlib import suppress
 from copy import deepcopy
@@ -33,6 +27,7 @@ from typing import TYPE_CHECKING, Any

 import numpy as np
 import torch
+from huggingface_hub import snapshot_download
 from torch import Tensor

 from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
@@ -59,71 +54,14 @@ from lerobot.utils.constants import (
 )
 from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package

-from .configuration_molmoact2 import MolmoAct2Config
-from .modeling_molmoact2 import _hf_token, _resolve_checkpoint_location
-
-logger = logging.getLogger(__name__)
-
-MOLMOACT2_DEFAULT_NUM_IMAGES = 2
-MOLMOACT2_IMAGE_TOKENS_PER_IMAGE = 196
-MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET = 80
-MOLMOACT2_TASK_TOKEN_BUDGET = 32
-MOLMOACT2_SEQUENCE_LENGTH_MARGIN = 32
-MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE = 64
-MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS = 4
-MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP = 6
-MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM = 0.95
-
-
-def _round_up(value: int, multiple: int) -> int:
-    return int(math.ceil(value / multiple) * multiple)
-
-
-def infer_molmoact2_max_sequence_length(
-    *,
-    num_images: int,
-    state_dim: int,
-    action_dim: int,
-    action_horizon: int,
-    include_discrete_action: bool,
-) -> int:
-    """Infer the padded text/image sequence cap from MolmoAct2's fixed token layout."""
-    if num_images < 1:
-        num_images = MOLMOACT2_DEFAULT_NUM_IMAGES
-    if state_dim < 0:
-        state_dim = 0
-    if action_dim < 1:
-        action_dim = 1
-    if action_horizon < 1:
-        action_horizon = 1
-
-    image_tokens = num_images * MOLMOACT2_IMAGE_TOKENS_PER_IMAGE
-    prompt_tokens = (
-        MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET
-        + MOLMOACT2_TASK_TOKEN_BUDGET
-        + state_dim
-        + MOLMOACT2_SEQUENCE_LENGTH_MARGIN
-    )
-    action_tokens = 0
-    if include_discrete_action:
-        action_tokens_per_step = max(
-            MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP,
-            math.ceil(action_dim * MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM),
-        )
-        action_tokens = MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS + action_horizon * action_tokens_per_step
-
-    return _round_up(
-        image_tokens + prompt_tokens + action_tokens,
-        MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE,
-    )
-
+from .configuration_molmoact2 import MolmoAct2Config, infer_molmoact2_max_sequence_length

 if TYPE_CHECKING or _transformers_available:
    from transformers import Qwen2Tokenizer

-    from .molmoact2_hf_model.image_processing_molmoact2 import MolmoAct2ImageProcessor
-    from .molmoact2_hf_model.processing_molmoact2 import MolmoAct2Processor
-    from .molmoact2_hf_model.video_processing_molmoact2 import MolmoAct2VideoProcessor
+    from .hf_model.image_processing_molmoact2 import MolmoAct2ImageProcessor
+    from .hf_model.processing_molmoact2 import MolmoAct2Processor
+    from .hf_model.video_processing_molmoact2 import MolmoAct2VideoProcessor
 else:
    Qwen2Tokenizer = None
    MolmoAct2ImageProcessor = None
@@ -131,7 +69,7 @@ else:
    MolmoAct2VideoProcessor = None

 if TYPE_CHECKING or (_transformers_available and _scipy_available):
-    from .molmoact2_hf_model.action_tokenizer import UniversalActionProcessor
+    from .hf_model.action_tokenizer import UniversalActionProcessor
 else:
    UniversalActionProcessor = None

@@ -159,6 +97,32 @@ _QUESTION_PREFIX_PATTERNS = tuple(
 )


+def _hf_token() -> str | None:
+    return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_checkpoint_location(
+    checkpoint_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    checkpoint_path = str(checkpoint_path or "").strip()
+    if not checkpoint_path:
+        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
+    local_path = Path(checkpoint_path).expanduser()
+    if local_path.exists():
+        return str(local_path)
+    return snapshot_download(
+        repo_id=checkpoint_path,
+        repo_type="model",
+        revision=revision,
+        force_download=force_download,
+        ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+        token=_hf_token(),
+    )
+
+
 def _load_hf_norm_stats_for_tag(
    checkpoint_path: str,
    *,
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 # Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,16 +35,16 @@ pytest.importorskip("scipy")
 from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.policies import get_policy_class, make_policy_config
 from lerobot.policies.molmoact2 import (
+    configuration_molmoact2 as molmoact2_config,
    modeling_molmoact2 as molmoact2_modeling,
    processor_molmoact2 as molmoact2_processor,
 )
-from lerobot.policies.molmoact2.configuration_molmoact2 import MolmoAct2Config
-from lerobot.policies.molmoact2.modeling_molmoact2 import (
-    MolmoAct2Policy,
-    _apply_action_chunk_padding_mask,
-    _apply_action_dim_padding_mask,
-    _combine_rollout_seeds,
+from lerobot.policies.molmoact2.configuration_molmoact2 import (
+    MolmoAct2Config,
+    MolmoAct2CosineDecayWithWarmupSchedulerConfig,
+    infer_molmoact2_max_sequence_length,
 )
+from lerobot.policies.molmoact2.modeling_molmoact2 import MolmoAct2Policy
 from lerobot.policies.molmoact2.processor_molmoact2 import (
    MolmoAct2ClampNormalizedProcessorStep,
    MolmoAct2MaskedNormalizerProcessorStep,
@@ -51,7 +53,6 @@ from lerobot.policies.molmoact2.processor_molmoact2 import (
    _add_gripper_masks_to_stats,
    _build_discrete_state_string,
    _normalize_question_text,
-    infer_molmoact2_max_sequence_length,
    make_molmoact2_pre_post_processors,
 )
 from lerobot.policies.rtc.configuration_rtc import RTCConfig
@@ -70,38 +71,34 @@ def test_molmoact2_policy_registration():
    assert cfg.per_episode_seed is False
    assert cfg.eval_seed is None
    assert cfg.normalize_language is True
-    assert cfg.get_scheduler_preset().num_decay_steps == 100_000
+    assert cfg.get_scheduler_preset().num_decay_steps is None
    assert cfg.action_delta_indices == list(range(cfg.chunk_size))
    assert get_policy_class("molmoact2") is MolmoAct2Policy


 def test_molmoact2_checkpoint_download_ignores_remote_python(monkeypatch):
-    import huggingface_hub
-
    download_kwargs = {}

    def fake_snapshot_download(**kwargs):
        download_kwargs.update(kwargs)
        return "/tmp/downloaded-molmoact2"

-    monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
+    monkeypatch.setattr(molmoact2_config, "snapshot_download", fake_snapshot_download)

-    checkpoint_location = molmoact2_modeling._resolve_checkpoint_location("allenai/MolmoAct2")
+    checkpoint_location = molmoact2_config._resolve_checkpoint_location("allenai/MolmoAct2")

    assert checkpoint_location == "/tmp/downloaded-molmoact2"
    assert download_kwargs["ignore_patterns"] == ["*.py", "*.pyc", "__pycache__/*"]


-def test_molmoact2_scheduler_auto_scales_to_training_steps():
-    from lerobot.optim import CosineDecayWithWarmupSchedulerConfig
-
+def test_molmoact2_scheduler_decay_steps_auto_match_training_steps():
    param = torch.nn.Parameter(torch.ones(()))
    optimizer = torch.optim.AdamW([param], lr=0.001)
-    config = CosineDecayWithWarmupSchedulerConfig(
+    config = MolmoAct2CosineDecayWithWarmupSchedulerConfig(
        peak_lr=0.01,
        decay_lr=0.001,
        num_warmup_steps=10,
-        num_decay_steps=100_000,
+        num_decay_steps=None,
    )

    scheduler = config.build(optimizer, num_training_steps=100)
@@ -126,7 +123,9 @@ def test_molmoact2_rollout_generator_uses_eval_seed_per_task():
        batch_size=3,
        device=torch.device("cpu"),
    )
-    expected_first = torch.Generator().manual_seed(_combine_rollout_seeds(first_seed=1000, batch_size=3))
+    expected_first = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1000, batch_size=3)
+    )
    assert torch.allclose(torch.rand(4, generator=first), torch.rand(4, generator=expected_first))

    policy.reset()
@@ -135,7 +134,9 @@ def test_molmoact2_rollout_generator_uses_eval_seed_per_task():
        batch_size=3,
        device=torch.device("cpu"),
    )
-    expected_second = torch.Generator().manual_seed(_combine_rollout_seeds(first_seed=1003, batch_size=3))
+    expected_second = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1003, batch_size=3)
+    )
    assert torch.allclose(torch.rand(4, generator=second), torch.rand(4, generator=expected_second))

    policy.reset()
@@ -144,7 +145,9 @@ def test_molmoact2_rollout_generator_uses_eval_seed_per_task():
        batch_size=3,
        device=torch.device("cpu"),
    )
-    expected_new_task = torch.Generator().manual_seed(_combine_rollout_seeds(first_seed=1000, batch_size=3))
+    expected_new_task = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1000, batch_size=3)
+    )
    assert torch.allclose(torch.rand(4, generator=new_task), torch.rand(4, generator=expected_new_task))


@@ -534,26 +537,36 @@ def test_train_action_expert_only_requires_continuous_action_mode():


 def test_molmoact2_sequence_length_is_inferred_from_fixed_token_budget():
-    assert (
-        infer_molmoact2_max_sequence_length(
-            num_images=2, state_dim=8, action_dim=7, action_horizon=10, include_discrete_action=True
-        )
-        == 640
+    cfg = MolmoAct2Config(
+        action_mode="both",
+        chunk_size=10,
+        n_action_steps=10,
+        image_keys=["observation.images.image", "observation.images.wrist_image"],
+        input_features={OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,))},
+        output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,))},
    )
+
+    assert cfg.max_sequence_length is None
+    assert cfg.inferred_max_sequence_length() == 640
+    assert cfg.inferred_max_sequence_length(include_discrete_action=False) == 576
    assert (
        infer_molmoact2_max_sequence_length(
-            num_images=2, state_dim=8, action_dim=7, action_horizon=10, include_discrete_action=False
-        )
-        == 576
-    )
-    assert (
-        infer_molmoact2_max_sequence_length(
-            num_images=2, state_dim=8, action_dim=7, action_horizon=30, include_discrete_action=True
+            num_images=2,
+            state_dim=8,
+            action_dim=7,
+            action_horizon=30,
+            include_discrete_action=True,
        )
        == 768
    )


+def test_molmoact2_sequence_length_override_is_preserved():
+    cfg = MolmoAct2Config(max_sequence_length=1024)
+
+    assert cfg.inferred_max_sequence_length(num_images=2, state_dim=8, action_dim=7) == 1024
+
+
 def test_train_action_expert_only_freezes_non_action_expert_params():
    class DummyBackbone(torch.nn.Module):
        def __init__(self):
@@ -950,7 +963,7 @@ def test_action_dim_padding_loss_reduces_like_old_trainer():
        ]
    )

-    reduced = _apply_action_dim_padding_mask(loss, action_dim_is_pad)
+    reduced = MolmoAct2Policy._apply_action_dim_padding_mask(loss, action_dim_is_pad)

    expected = torch.stack(
        [
@@ -966,7 +979,7 @@ def test_action_chunk_padding_keeps_old_mean_denominator():
    loss = torch.ones(1, 2, 4, 3)
    action_horizon_is_pad = torch.tensor([[False, False, True, True]])

-    masked = _apply_action_chunk_padding_mask(loss, action_horizon_is_pad)
+    masked = MolmoAct2Policy._apply_action_chunk_padding_mask(loss, action_horizon_is_pad)

    assert masked.mean().item() == 0.5

@@ -768,34 +768,46 @@ wheels = [

 [[package]]
 name = "cmeel-tinyxml2"
-version = "11.0.0"
+version = "10.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "cmeel" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9b/96/4311533fee0a364bb605b585762f04c249f47857b33548a8ea837a7eb860/cmeel_tinyxml2-11.0.0.tar.gz", hash = "sha256:85d9c7680b3369af4c6b40a0dce70bbd84aa67832755622e57eb260cd95abe40", size = 645900, upload-time = "2026-05-21T11:49:32.652Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/28/9f/030eca702c485f7a641f975f167fa93164911b3329f005fb0730ff5e793f/cmeel_tinyxml2-10.0.0.tar.gz", hash = "sha256:00252aefc1c94a55b89f25ad08ee79fda2da8d1d94703e051598ddb52a9088fe", size = 645297, upload-time = "2025-02-06T10:29:00.106Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/79/f0/90c1640c53b623359d75ab1c70bdf19dc0afe82722bc5df57d09f8eaf83a/cmeel_tinyxml2-11.0.0-0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:b0bd974e549b8c444626671a8e645897603ebf5225734cbe04a9dd3461477754", size = 111719, upload-time = "2026-05-21T11:49:25.999Z" },
-    { url = "https://files.pythonhosted.org/packages/56/40/166447150a31bc3b794ffb493d5a634f67ffbc75dd8b4c46373701b7ef15/cmeel_tinyxml2-11.0.0-0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9a1406f408262c37ae7c4566b1d67801c4b10c4980903fb1ef0ba45fa4407072", size = 109146, upload-time = "2026-05-21T11:49:27.829Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/ca/3cc665afe2d76999f15454bb3b2f7c05f0088ad7de35718648291a536fd9/cmeel_tinyxml2-11.0.0-0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6f830007917c3e36f26b27d170ce84a619a62f46104d3cce435dff0125dd665f", size = 157109, upload-time = "2026-05-21T11:49:29.358Z" },
-    { url = "https://files.pythonhosted.org/packages/87/4e/dcc0d9756d93be734d824e2a570cc9ac68909a1d7d3b6fc87c2fb32726c0/cmeel_tinyxml2-11.0.0-0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:18674156bd41f3993dc1d5199da04fa496674358daa6588090fb9f86c71917b0", size = 148825, upload-time = "2026-05-21T11:49:31.035Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/5d/bc3a932eb7996a0a789979426a9bb8a3948bf57f3f17bab87dddbef62433/cmeel_tinyxml2-10.0.0-0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:924499bb1b60b9a17bd001d12a9af88ddbee4ca888638ae684ba7f0f3ce49e87", size = 111913, upload-time = "2025-02-06T10:28:45.723Z" },
+    { url = "https://files.pythonhosted.org/packages/92/bf/67d11e123313c034712896e94038291fe506bb099bdb75a136392002ffd0/cmeel_tinyxml2-10.0.0-0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:26a1eb30c2a00bfc172e89ed015a18b8efb2b383546252ca8859574aed684686", size = 109487, upload-time = "2025-02-06T10:28:47.546Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/48/d8c81ce19b4b278ed0e8f81f93ae8670209bf3a9ac20141b9c386bb40cc7/cmeel_tinyxml2-10.0.0-0-py3-none-manylinux_2_17_i686.whl", hash = "sha256:53d86e02864c712f51f9a9adfcd8b6046b2ed51d44a0c34a8438d93b72b48325", size = 160118, upload-time = "2025-02-06T10:28:49.627Z" },
+    { url = "https://files.pythonhosted.org/packages/87/4e/62193e27c9581f8ba7aeaeca7805632a64f2f4a824b1db37ad02ee953e8a/cmeel_tinyxml2-10.0.0-0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:74112e2e9473afbf6ee2d25c9942553e9f6a40465e714533db72db48bc7658e1", size = 158477, upload-time = "2025-02-06T10:28:51.667Z" },
+    { url = "https://files.pythonhosted.org/packages/14/f9/d0420c39e9ade99beeec61cd3abc68880fe6e14d85e9df292af8fabe65c8/cmeel_tinyxml2-10.0.0-0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ecd6e99caa2a06ac0d4b333b740c20fca526d0ca426f99eb5c0a0039117afdb6", size = 147025, upload-time = "2025-02-06T10:28:53.944Z" },
+    { url = "https://files.pythonhosted.org/packages/66/9e/df63147fc162ab487217fa5596778ab7a81a82d9b3ce4236fd3a1e48cecb/cmeel_tinyxml2-10.0.0-0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:30993fffb7032a45d5d3b1e5670cb879dad667a13144cd68c8f4e0371a8a3d2e", size = 150958, upload-time = "2025-02-06T10:28:55.301Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/a8/b03567275fd83f5af33ddb61de942689dec72c5b21bec01e6a5b11101aa5/cmeel_tinyxml2-10.0.0-0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:8c09ede51784af54211a6225884dc7ddbb02ea1681656d173060c7ad2a5b9a3c", size = 160300, upload-time = "2025-02-06T10:28:57.189Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/ec/2781635b66c1059ca1243ae0f5a0410e171a5d8b8a71be3e34cb172f9f2d/cmeel_tinyxml2-10.0.0-0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3bd511d6d0758224efdebc23d3ead6e94f0755b04141ebf7d5493377829e8332", size = 149184, upload-time = "2025-02-06T10:28:58.734Z" },
 ]

 [[package]]
 name = "cmeel-urdfdom"
-version = "6.0.0"
+version = "4.0.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "cmeel" },
    { name = "cmeel-console-bridge" },
    { name = "cmeel-tinyxml2" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/21/75/4e8aff079e98582aeeb8e752805081da0c2dea405e79bafeefb555defe9f/cmeel_urdfdom-6.0.0.tar.gz", hash = "sha256:65c0fdc6021300fc55b2d0c03ab64dedc328034a74e40498e671bc894bb1dcf7", size = 303688, upload-time = "2026-05-21T12:08:56.663Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/31/09/be81a5e7db56f34b6ccdbe7afe855c95a18c8439e173519e0146e9276a8c/cmeel_urdfdom-4.0.1.tar.gz", hash = "sha256:2e3f41e8483889e195b574acb326a4464cf11a3c0a8724031ac28bcda2223efc", size = 291511, upload-time = "2025-02-12T12:07:09.699Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f2/40/51ba667135f01631179eee1614557193f8453740f248302d1b8b7f9f693e/cmeel_urdfdom-6.0.0-0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:53d55cebb137a6e4dac6c16fa53f2dc2b7b9b5cda644bd1637a5bb849cd96e52", size = 381501, upload-time = "2026-05-21T12:08:48.758Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/d1/2b49a8c940fa75abc13df9842c14e577e6a82d5854b6d52597ce3bb04894/cmeel_urdfdom-6.0.0-0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0ef424735bd30f4afa4d1b4ddca9b297498c43005ddd775c080e55f62e9e0466", size = 377159, upload-time = "2026-05-21T12:08:50.485Z" },
-    { url = "https://files.pythonhosted.org/packages/db/ac/0efde3a48220b55707bafb6d2e2dcca562f99dcd5c2c15311f7696eeacce/cmeel_urdfdom-6.0.0-0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:0436f5230f1484c8e583284ef48c7291b230ada3dc5fb2937941f582e72409ec", size = 506000, upload-time = "2026-05-21T12:08:52.273Z" },
-    { url = "https://files.pythonhosted.org/packages/32/d4/dfd617e598100e4e53ae3d228a968facff80bae53038fb18e2dccb1ab03a/cmeel_urdfdom-6.0.0-0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:7ab1be680a8ec866d5422c617b641d1f0e38774061df28b8b426fb26edce6337", size = 530049, upload-time = "2026-05-21T12:08:54.224Z" },
+    { url = "https://files.pythonhosted.org/packages/be/d0/20147dd6bb723afc44a58d89ea624df2bad1bed7b898a2df112aaca4a479/cmeel_urdfdom-4.0.1-0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:2fe56939c6b47f6ec57021aac154123da47ecdcd79a217f3a5e3c4b705a07dee", size = 300860, upload-time = "2025-02-12T12:06:58.536Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/98/f832bca347e2d987c6b0ebb6930caf7b2c402535324aeed466b6aa2c4513/cmeel_urdfdom-4.0.1-0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:00a0aba78b68c428b27abeed1db58d73e65319ed966911a0e97b37367442e756", size = 300616, upload-time = "2025-02-12T12:07:00.556Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/10/bf5765b6f388037cff166a754a0958ac2fee34ca3c0975ef64d0324e4647/cmeel_urdfdom-4.0.1-0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:a701a8f9671331f11b18ecf37a6537db546a21e6a0e5d0ff53341fea0693ed7f", size = 385951, upload-time = "2025-02-12T12:07:02.556Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/82/cb3f8f587d293a17bdbea15b50cdaa4a1e28e04583eb4cb4821685b89466/cmeel_urdfdom-4.0.1-0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:12e39fc388c077d79fc9b3841d3d972a1da90b90de754d3363194c1540e18abf", size = 399619, upload-time = "2025-02-12T12:07:04.388Z" },
+    { url = "https://files.pythonhosted.org/packages/24/77/322d7ac92c692d8dfaeda9de2d937087d15e2b564dc457d656e5fde3991d/cmeel_urdfdom-4.0.1-0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c4a83925df1d5923c4485c3eb2b80b3a61b14f119ab724fb5bd04cec494690ee", size = 373969, upload-time = "2025-02-12T12:07:06.222Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/63/bdc6b55cc8bd99bb9dce6be801b30feffaa1c3841ecb7f4fe4d137424518/cmeel_urdfdom-4.0.1-0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4c4f44270971b3d05c45a4e21b1fb2df7e05a750363ae918f59532bff0bfe0e1", size = 388237, upload-time = "2025-02-12T12:07:08.326Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/2d/8463fc23230612daf4da1e31d3229f47708381f3ae4d1500f0f007ac0f92/cmeel_urdfdom-4.0.1-1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:f7535158f45992eb2ba79e90d9db1bf9adc3846d9c7ed3e7a8c1c4d5343afa37", size = 301006, upload-time = "2025-02-13T11:42:08.8Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/d5/c8cdf500e49300d85624cbc3ef804107ddcdc9c541b1d3f726bfb58a9fc1/cmeel_urdfdom-4.0.1-1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fef2a01a00d61d41b3d35dd4958bba973e9025c26eea1d3c9880932f4dba89a5", size = 300758, upload-time = "2025-02-13T11:42:10.449Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/b3/2f7bac1544113a7f8e0f6d8b1fab5e75c6a3d27ffbb584b03267251b2165/cmeel_urdfdom-4.0.1-1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7a52eb36950ce982014d99a55717ca29985da056e3705f20746f15d3244c1f7a", size = 386043, upload-time = "2025-02-13T11:42:11.923Z" },
+    { url = "https://files.pythonhosted.org/packages/86/03/8bdeb36ba6a3e8125d523ecfc010403049e463fe589f9896858d4bdcaf1e/cmeel_urdfdom-4.0.1-1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:9f3b9c80b10d7246821ff61c2573f799e3da23d483e6f7367ddcad8a48baf58f", size = 399719, upload-time = "2025-02-13T11:42:14.325Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/ed/43f99e7512460294cd8acc5753ba25f8a20bdf28d62e143eaf3ec7a28bb6/cmeel_urdfdom-4.0.1-1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2de69f47e8312cc09157624802d5bdaad6406443f863fb4b9ec62a19b4de3c72", size = 374073, upload-time = "2025-02-13T11:42:17.907Z" },
+    { url = "https://files.pythonhosted.org/packages/17/c6/2e9bde6d7c02c1cf203ea896f8ce1afd441412f09b44830f1ee4a96d77de/cmeel_urdfdom-4.0.1-1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7708c1402de450fbeab21f7ca264a9a4676ed4c1cdf8d84d840bc5d057aac920", size = 388337, upload-time = "2025-02-13T11:42:19.657Z" },
 ]

 [[package]]
@@ -1147,7 +1159,7 @@ name = "decord"
 version = "0.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "(platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "numpy", marker = "(platform_machine != 'arm64' and platform_machine != 's390x' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 's390x' and sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
    { url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" },
@@ -2788,6 +2800,8 @@ accelerate-dep = [
 all = [
    { name = "accelerate" },
    { name = "av" },
+    { name = "cmeel-tinyxml2" },
+    { name = "cmeel-urdfdom" },
    { name = "contourpy" },
    { name = "datasets" },
    { name = "debugpy" },
@@ -2971,6 +2985,8 @@ hardware = [
 ]
 hilserl = [
    { name = "av" },
+    { name = "cmeel-tinyxml2" },
+    { name = "cmeel-urdfdom" },
    { name = "datasets" },
    { name = "grpcio" },
    { name = "gym-hil" },
@@ -2993,6 +3009,8 @@ intelrealsense = [
    { name = "pyrealsense2-macosx", marker = "sys_platform == 'darwin'" },
 ]
 kinematics = [
+    { name = "cmeel-tinyxml2" },
+    { name = "cmeel-urdfdom" },
    { name = "placo" },
 ]
 lekiwi = [
@@ -3066,6 +3084,8 @@ pi = [
    { name = "transformers" },
 ]
 placo-dep = [
+    { name = "cmeel-tinyxml2" },
+    { name = "cmeel-urdfdom" },
    { name = "placo" },
 ]
 pusht = [
@@ -3186,6 +3206,8 @@ requires-dist = [
    { name = "accelerate", marker = "extra == 'accelerate-dep'", specifier = ">=1.14.0,<2.0.0" },
    { name = "av", marker = "extra == 'av-dep'", specifier = ">=15.0.0,<16.0.0" },
    { name = "cmake", specifier = ">=3.29.0.1,<4.2.0" },
+    { name = "cmeel-tinyxml2", marker = "extra == 'placo-dep'", specifier = "<11" },
+    { name = "cmeel-urdfdom", marker = "extra == 'placo-dep'", specifier = ">=4,<5" },
    { name = "contourpy", marker = "extra == 'matplotlib-dep'", specifier = ">=1.3.0,<2.0.0" },
    { name = "datasets", marker = "extra == 'dataset'", specifier = ">=4.7.0,<5.0.0" },
    { name = "debugpy", marker = "extra == 'dev'", specifier = ">=1.8.1,<1.9.0" },