fix(smolvla2): use RobotConfig.max_relative_target, drop --max_action_norm

The hand-rolled action-norm safety clip duplicated what every
``RobotConfig`` already exposes — ``max_relative_target`` — and at
the wrong layer (after postprocess but before send_action, instead
of inside the robot driver where every other lerobot entry point
puts it). The norm clip also rejected entire actions instead of
clipping per-motor relative motion, so a single rogue joint would
kill the whole tick.

Replace with ``--robot.max_relative_target``: a string parsed as
either a bare float (uniform per-motor cap) or a JSON object
mapping motor name → cap. Passed through to
``RobotConfig(max_relative_target=...)`` at robot construction;
the driver's ``send_action`` clips each commanded joint position
relative to the current measured one before issuing it on the bus —
same behaviour ``lerobot-record`` ships.

Also bump ``--chunk_hz`` default from ``4.0`` to ``1.0``. One new
chunk per second is what the trained checkpoint can comfortably
keep up with on common hardware and gives smoother motion than
sub-second chunk regenerations (no RTC interpolation between
chunks yet).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-12 11:41:57 +02:00
parent 33a4b4a5a0
commit c661d81409
+46 -22
View File
@@ -180,14 +180,19 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
),
)
p.add_argument(
"--max_action_norm",
dest="max_action_norm",
type=float,
"--robot.max_relative_target",
dest="robot_max_relative_target",
type=str,
default=None,
help=(
"Safety clip: reject any individual action whose L2 norm "
"exceeds this value. Default ``None`` = no clipping. Useful "
"as a kill-switch when bringing up a new robot/task pair."
"Safety clip on per-motor relative motion, passed through to "
"``RobotConfig.max_relative_target``. Accepts either a float "
"(applied to every motor — e.g. ``5.0`` degrees) or a JSON "
"object mapping motor names to caps "
"(e.g. ``'{\"shoulder_pan\": 5, \"gripper\": 30}'``). The "
"robot driver clips each commanded position relative to the "
"current measured position before sending — same kill-switch "
"``lerobot-record`` uses. Default ``None`` = no clipping."
),
)
p.add_argument(
@@ -213,7 +218,16 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
help="Pocket-tts voice name (or path to a .wav for cloning).",
)
p.add_argument(
"--chunk_hz", type=float, default=4.0, help="Action-chunk generation rate."
"--chunk_hz",
type=float,
default=1.0,
help=(
"Action-chunk generation rate (Hz). Default ``1.0`` — one "
"new chunk per second. Lower = less inference cost / "
"smoother behaviour but longer reaction time to changes. "
"Higher = fresher actions / more inference cost; cap at "
"~1/(forward-pass latency)."
),
)
p.add_argument(
"--ctrl_hz", type=float, default=50.0, help="Action dispatch rate."
@@ -427,12 +441,16 @@ def _build_robot(
robot_port: str | None,
robot_id: str | None,
robot_cameras_json: str | None,
robot_max_relative_target: str | None,
):
"""Build and connect a robot from CLI args.
Mirrors how ``lerobot-record`` builds a robot but takes the args
flat from argparse instead of through draccus, so the runtime
keeps its plain ``--key=value`` CLI surface.
keeps its plain ``--key=value`` CLI surface. ``max_relative_target``
is passed through to the RobotConfig — the driver itself clips each
commanded joint position relative to the current measured one
before issuing it on the bus.
"""
import json # noqa: PLC0415
@@ -454,6 +472,21 @@ def _build_robot(
raise ValueError(
f"--robot.cameras must be a JSON object, got {robot_cameras_json!r}: {exc}"
) from exc
if robot_max_relative_target:
# Accept either a bare float (uniform cap) or a JSON object
# (per-motor cap). Matches ``RobotConfig.max_relative_target``'s
# ``float | dict[str, float] | None`` shape.
s = robot_max_relative_target.strip()
try:
if s.startswith("{"):
kwargs["max_relative_target"] = json.loads(s)
else:
kwargs["max_relative_target"] = float(s)
except (json.JSONDecodeError, ValueError) as exc:
raise ValueError(
f"--robot.max_relative_target must be a float or JSON dict, "
f"got {robot_max_relative_target!r}: {exc}"
) from exc
cfg = cls(**kwargs)
robot = make_robot_from_config(cfg)
robot.connect()
@@ -518,15 +551,15 @@ def _build_robot_action_executor(
robot,
postprocessor: Any,
ds_meta: Any,
max_action_norm: float | None,
) -> Callable[[Any], None]:
"""Closure that postprocesses an action and dispatches to the robot.
Mirrors ``lerobot-record``'s ``predict_action`` tail: postprocess
(denormalise) → ``make_robot_action`` (tensor → ``{joint: value}``
dict) → ``robot.send_action(...)``. Optional safety clip on the
action's L2 norm acts as a kill switch when bringing up a new
robot/task pair.
dict) → ``robot.send_action(...)``. Safety clipping happens *inside*
``robot.send_action`` via the driver's ``max_relative_target``
cap (passed in at ``RobotConfig`` construction time) — same place
``lerobot-record`` enforces it.
"""
import torch # noqa: PLC0415
@@ -537,15 +570,6 @@ def _build_robot_action_executor(
if postprocessor is not None:
action = postprocessor(action)
if isinstance(action, torch.Tensor):
if max_action_norm is not None:
norm = float(action.float().norm().item())
if norm > max_action_norm:
logger.warning(
"action norm %.3f > max_action_norm=%.3f"
"rejecting tick",
norm, max_action_norm,
)
return
if action.ndim > 1 and action.shape[0] == 1:
action = action.squeeze(0)
action_dict = make_robot_action(action, ds_meta.features)
@@ -745,6 +769,7 @@ def main(argv: list[str] | None = None) -> int:
robot_port=args.robot_port,
robot_id=args.robot_id,
robot_cameras_json=args.robot_cameras,
robot_max_relative_target=args.robot_max_relative_target,
)
observation_provider = _build_robot_observation_provider(
robot=robot,
@@ -756,7 +781,6 @@ def main(argv: list[str] | None = None) -> int:
robot=robot,
postprocessor=postprocessor,
ds_meta=ds_meta,
max_action_norm=args.max_action_norm,
)
elif args.dataset_repo_id is not None:
print(