mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-25 05:29:55 +00:00
fix(smolvla2): make HighLevelSubtaskFwd actually fire at low hz + quiet startup log
Two runtime fixes that surfaced from on-robot testing. (1) HighLevelSubtaskFwd was double-gated: HzTrigger fires once per period (e.g. every 5 s at --high_level_hz=0.2) AND the step requires the action queue to be empty. The queue-empty window is brief (~tens of ms between drain and refill) and almost never coincides with the low-hz timer, so HL effectively never fired and the subtask shown in the runtime panel stayed on the dataset's frame-0 annotation. Add HzTrigger.rearm() and have HighLevelSubtaskFwd call it when skipping because the action queue is non-empty — the trigger stays armed and tries again on the next tick instead of waiting another full period. LowLevelForward keeps the original "skip" semantics because chunk_hz is meant as a true upper bound on chunk-generation rate. (2) The "robot state at startup" warning in _build_robot_observation_provider was meant to fire once but wasn't gated by _resize_logged like the sibling "camera ... live=AxB" warning. Result: it was logged on every observation tick (every ~1-2 s). Gate it on first_call (the negation of _resize_logged["done"], snapshotted at the start of the call, before the resize loop sets the flag) so both logs fire once at session start. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -463,8 +463,16 @@ class HighLevelSubtaskFwd(InferenceStep):
|
||||
# of 30/sec and the robot barely moves. Tying it to the same
|
||||
# "queue empty" condition as the chunk refresh produces a
|
||||
# clean sense → think → act cycle.
|
||||
#
|
||||
# Rearm the trigger when skipping so a low-hz schedule
|
||||
# (e.g. ``--high_level_hz=0.2`` = once per 5 s) doesn't lose
|
||||
# the slot: the trigger fires once on the timer but the brief
|
||||
# queue-empty window almost never coincides, so without rearm
|
||||
# HL would effectively never run.
|
||||
queue = state.get("action_queue") or []
|
||||
if len(queue) > 0:
|
||||
if hasattr(self.trigger, "rearm"):
|
||||
self.trigger.rearm()
|
||||
return None
|
||||
ctx = _msgs_for_subtask(state)
|
||||
observation = _maybe_observation(self.observation_provider)
|
||||
|
||||
@@ -82,7 +82,15 @@ class Trigger(Protocol):
|
||||
|
||||
@dataclass
|
||||
class HzTrigger:
|
||||
"""Fire at most ``hz`` times per second."""
|
||||
"""Fire at most ``hz`` times per second.
|
||||
|
||||
A step that gates further (e.g. ``HighLevelSubtaskFwd`` skipping
|
||||
when the action queue is non-empty) and wants the trigger to
|
||||
retry next tick instead of waiting a full period can call
|
||||
:meth:`rearm` from inside ``run``. Without this, a low-hz trigger
|
||||
(e.g. ``hz=0.2`` = once per 5 s) almost never coincides with the
|
||||
brief queue-empty window and the step never fires at all.
|
||||
"""
|
||||
|
||||
hz: float
|
||||
_last_seconds: float | None = field(default=None, init=False)
|
||||
@@ -94,6 +102,15 @@ class HzTrigger:
|
||||
return True
|
||||
return False
|
||||
|
||||
def rearm(self) -> None:
|
||||
"""Mark the trigger as not having fired, so the next tick re-evaluates.
|
||||
|
||||
Used by a step that decided to skip after ``should_fire`` already
|
||||
committed the firing — keeps the cadence honest without losing
|
||||
the slot.
|
||||
"""
|
||||
self._last_seconds = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class EventTrigger:
|
||||
|
||||
@@ -772,6 +772,11 @@ def _build_robot_observation_provider(
|
||||
import cv2 as _cv2 # noqa: PLC0415
|
||||
import numpy as _np # noqa: PLC0415
|
||||
|
||||
# Snapshot the gate state at the start of the call: the
|
||||
# camera info and startup-state warnings are meant to fire
|
||||
# exactly once (operator sanity check), so gate them on
|
||||
# the *previous* value rather than the post-loop value.
|
||||
first_call = not _resize_logged["done"]
|
||||
for cam_key, (target_h, target_w) in target_image_shapes.items():
|
||||
img = raw.get(cam_key)
|
||||
if img is None or not isinstance(img, _np.ndarray):
|
||||
@@ -779,7 +784,7 @@ def _build_robot_observation_provider(
|
||||
if img.ndim != 3:
|
||||
continue
|
||||
cur_h, cur_w = img.shape[:2]
|
||||
if not _resize_logged["done"]:
|
||||
if first_call:
|
||||
logger.warning(
|
||||
"camera %s: live=%dx%d, training=%dx%d (resize=%s)",
|
||||
cam_key,
|
||||
@@ -793,13 +798,14 @@ def _build_robot_observation_provider(
|
||||
continue
|
||||
raw[cam_key] = _cv2.resize(img, (target_w, target_h), interpolation=_cv2.INTER_AREA)
|
||||
_resize_logged["done"] = True
|
||||
# Also print the state vector once so the operator
|
||||
# can eyeball it against the dataset's stats. State
|
||||
# OOD is a real failure mode for VLAs — the prefix
|
||||
# carries state via the projection layer, and a
|
||||
# neutral home pose can easily sit a couple σ off
|
||||
# the supervised support region.
|
||||
if "observation.state" in (ds_features or {}):
|
||||
# Print the state vector once so the operator can eyeball
|
||||
# it against the dataset's stats. State OOD is a real
|
||||
# failure mode for VLAs — the prefix carries state via
|
||||
# the projection layer, and a neutral home pose can
|
||||
# easily sit a couple σ off the supervised support
|
||||
# region. Gated on ``first_call`` so this doesn't spam
|
||||
# every observation tick.
|
||||
if first_call and "observation.state" in (ds_features or {}):
|
||||
state_names = ds_features["observation.state"].get("names") or []
|
||||
state_vals = [raw.get(n) for n in state_names]
|
||||
logger.warning(
|
||||
|
||||
Reference in New Issue
Block a user