fix(smolvla2): use RobotConfig.max_relative_target, drop --max_action_norm

The hand-rolled action-norm safety clip duplicated what every ``RobotConfig`` already exposes — ``max_relative_target`` — and at the wrong layer (after postprocess but before send_action, instead of inside the robot driver where every other lerobot entry point puts it). The norm clip also rejected entire actions instead of clipping per-motor relative motion, so a single rogue joint would kill the whole tick. Replace with ``--robot.max_relative_target``: a string parsed as either a bare float (uniform per-motor cap) or a JSON object mapping motor name → cap. Passed through to ``RobotConfig(max_relative_target=...)`` at robot construction; the driver's ``send_action`` clips each commanded joint position relative to the current measured one before issuing it on the bus — same behaviour ``lerobot-record`` ships. Also bump ``--chunk_hz`` default from ``4.0`` to ``1.0``. One new chunk per second is what the trained checkpoint can comfortably keep up with on common hardware and gives smoother motion than sub-second chunk regenerations (no RTC interpolation between chunks yet). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 17:20:05 +00:00 · 2026-05-12 11:41:57 +02:00
parent 33a4b4a5a0
commit c661d81409
1 changed files with 46 additions and 22 deletions
@@ -180,14 +180,19 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
        ),
    )
    p.add_argument(
-        "--max_action_norm",
-        dest="max_action_norm",
-        type=float,
+        "--robot.max_relative_target",
+        dest="robot_max_relative_target",
+        type=str,
        default=None,
        help=(
-            "Safety clip: reject any individual action whose L2 norm "
-            "exceeds this value. Default ``None`` = no clipping. Useful "
-            "as a kill-switch when bringing up a new robot/task pair."
+            "Safety clip on per-motor relative motion, passed through to "
+            "``RobotConfig.max_relative_target``. Accepts either a float "
+            "(applied to every motor — e.g. ``5.0`` degrees) or a JSON "
+            "object mapping motor names to caps "
+            "(e.g. ``'{\"shoulder_pan\": 5, \"gripper\": 30}'``). The "
+            "robot driver clips each commanded position relative to the "
+            "current measured position before sending — same kill-switch "
+            "``lerobot-record`` uses. Default ``None`` = no clipping."
        ),
    )
    p.add_argument(
@@ -213,7 +218,16 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
        help="Pocket-tts voice name (or path to a .wav for cloning).",
    )
    p.add_argument(
-        "--chunk_hz", type=float, default=4.0, help="Action-chunk generation rate."
+        "--chunk_hz",
+        type=float,
+        default=1.0,
+        help=(
+            "Action-chunk generation rate (Hz). Default ``1.0`` — one "
+            "new chunk per second. Lower = less inference cost / "
+            "smoother behaviour but longer reaction time to changes. "
+            "Higher = fresher actions / more inference cost; cap at "
+            "~1/(forward-pass latency)."
+        ),
    )
    p.add_argument(
        "--ctrl_hz", type=float, default=50.0, help="Action dispatch rate."
@@ -427,12 +441,16 @@ def _build_robot(
    robot_port: str | None,
    robot_id: str | None,
    robot_cameras_json: str | None,
+    robot_max_relative_target: str | None,
 ):
    """Build and connect a robot from CLI args.

    Mirrors how ``lerobot-record`` builds a robot but takes the args
    flat from argparse instead of through draccus, so the runtime
-    keeps its plain ``--key=value`` CLI surface.
+    keeps its plain ``--key=value`` CLI surface. ``max_relative_target``
+    is passed through to the RobotConfig — the driver itself clips each
+    commanded joint position relative to the current measured one
+    before issuing it on the bus.
    """
    import json  # noqa: PLC0415

@@ -454,6 +472,21 @@ def _build_robot(
            raise ValueError(
                f"--robot.cameras must be a JSON object, got {robot_cameras_json!r}: {exc}"
            ) from exc
+    if robot_max_relative_target:
+        # Accept either a bare float (uniform cap) or a JSON object
+        # (per-motor cap). Matches ``RobotConfig.max_relative_target``'s
+        # ``float | dict[str, float] | None`` shape.
+        s = robot_max_relative_target.strip()
+        try:
+            if s.startswith("{"):
+                kwargs["max_relative_target"] = json.loads(s)
+            else:
+                kwargs["max_relative_target"] = float(s)
+        except (json.JSONDecodeError, ValueError) as exc:
+            raise ValueError(
+                f"--robot.max_relative_target must be a float or JSON dict, "
+                f"got {robot_max_relative_target!r}: {exc}"
+            ) from exc
    cfg = cls(**kwargs)
    robot = make_robot_from_config(cfg)
    robot.connect()
@@ -518,15 +551,15 @@ def _build_robot_action_executor(
    robot,
    postprocessor: Any,
    ds_meta: Any,
-    max_action_norm: float | None,
 ) -> Callable[[Any], None]:
    """Closure that postprocesses an action and dispatches to the robot.

    Mirrors ``lerobot-record``'s ``predict_action`` tail: postprocess
    (denormalise) → ``make_robot_action`` (tensor → ``{joint: value}``
-    dict) → ``robot.send_action(...)``. Optional safety clip on the
-    action's L2 norm acts as a kill switch when bringing up a new
-    robot/task pair.
+    dict) → ``robot.send_action(...)``. Safety clipping happens *inside*
+    ``robot.send_action`` via the driver's ``max_relative_target``
+    cap (passed in at ``RobotConfig`` construction time) — same place
+    ``lerobot-record`` enforces it.
    """
    import torch  # noqa: PLC0415

@@ -537,15 +570,6 @@ def _build_robot_action_executor(
            if postprocessor is not None:
                action = postprocessor(action)
            if isinstance(action, torch.Tensor):
-                if max_action_norm is not None:
-                    norm = float(action.float().norm().item())
-                    if norm > max_action_norm:
-                        logger.warning(
-                            "action norm %.3f > max_action_norm=%.3f — "
-                            "rejecting tick",
-                            norm, max_action_norm,
-                        )
-                        return
                if action.ndim > 1 and action.shape[0] == 1:
                    action = action.squeeze(0)
                action_dict = make_robot_action(action, ds_meta.features)
@@ -745,6 +769,7 @@ def main(argv: list[str] | None = None) -> int:
            robot_port=args.robot_port,
            robot_id=args.robot_id,
            robot_cameras_json=args.robot_cameras,
+            robot_max_relative_target=args.robot_max_relative_target,
        )
        observation_provider = _build_robot_observation_provider(
            robot=robot,
@@ -756,7 +781,6 @@ def main(argv: list[str] | None = None) -> int:
            robot=robot,
            postprocessor=postprocessor,
            ds_meta=ds_meta,
-            max_action_norm=args.max_action_norm,
        )
    elif args.dataset_repo_id is not None:
        print(