diff --git a/src/lerobot/configs/recipes/hirobot.yaml b/src/lerobot/configs/recipes/hirobot.yaml index 3ee88690a..2fae907e1 100644 --- a/src/lerobot/configs/recipes/hirobot.yaml +++ b/src/lerobot/configs/recipes/hirobot.yaml @@ -3,11 +3,11 @@ # # Trains two things only: subtasks and VQA. Plan and memory are # intentionally left out for now — keeps the prompt short and the -# training surface small while the core action loop is validated. +# training surface small while the core subtask + action loop is +# validated. # -# high_level_subtask — predict the subtask from the task (text -# head only; not on the inference path yet). -# low_level_execution — flow loss with [images, task, state]. +# high_level_subtask — predict the subtask from the task. +# low_level_execution — flow loss with [images, subtask, state]. # ask_vqa_{top,wrist} — camera-grounded VQA. # # Each backbone's text tokenizer renders these messages differently @@ -25,15 +25,14 @@ blend: low_level_execution: weight: 0.40 messages: - # The action expert is conditioned on the TASK (not the subtask). - # The task is always available at inference with no high-level - # generation loop, so this removes the train/inference mismatch - # that a subtask-conditioned action head would have while there - # is no reliable runtime subtask source. ``high_level_subtask`` - # still trains the text head to predict subtasks for later use. - # ``stream: low_level`` flips ``predict_actions=True`` so the - # flow loss fires; no text-CE target here. - - {role: user, content: "${task}", stream: low_level} + # The action expert is conditioned on the SUBTASK — at inference + # the high-level loop (``HighLevelSubtaskFwd``) generates the + # subtask via the LM head and feeds it here. The action expert's + # prefix is [images, subtask, state]. ``stream: low_level`` flips + # ``predict_actions=True`` so the flow loss fires; no text-CE + # target here (subtask prediction is owned by + # ``high_level_subtask``). 
+ - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} ask_vqa_top: weight: 0.10 diff --git a/src/lerobot/policies/smolvla2/inference/runtime.py b/src/lerobot/policies/smolvla2/inference/runtime.py index 24b32a940..afc7cca18 100644 --- a/src/lerobot/policies/smolvla2/inference/runtime.py +++ b/src/lerobot/policies/smolvla2/inference/runtime.py @@ -30,6 +30,7 @@ from .steps import ( AskVQAFwd, DispatchAction, DispatchToolCalls, + HighLevelSubtaskFwd, InferenceStep, LowLevelForward, ) @@ -66,24 +67,29 @@ class SmolVLA2Runtime: _stop: bool = field(default=False, init=False) def __post_init__(self) -> None: - # VQA-only configuration (current scope). The training recipe - # supervises only subtasks + VQA — plan and memory are out for - # now — so the runtime drops the high-level subtask / - # memory-update / interjection steps. The remaining loop is: + # Subtask + VQA configuration (current scope — plan and memory + # are not trained yet). Pipeline: # - # AskVQAFwd → answer camera-grounded questions on stdin - # LowLevelForward → action chunk (conditioned on the task - # string directly, since no subtask is - # being generated — see LowLevelForward's - # ``current_subtask or task`` fallback) - # DispatchAction → drain the chunk to the robot - # DispatchToolCalls → fire any pending tool calls + # HighLevelSubtaskFwd → generate the next subtask via the LM + # head at ~``high_level_hz``; writes + # ``current_subtask`` + # AskVQAFwd → answer camera-grounded stdin questions + # LowLevelForward → action chunk conditioned on the + # generated ``current_subtask`` + # DispatchAction → drain the chunk to the robot + # DispatchToolCalls → fire any pending tool calls # - # ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` / - # ``UserInterjectionFwd`` are still importable from - # ``inference.steps`` — re-add them here once plan / memory / - # subtask generation is back in scope. 
+ # Order matters: ``HighLevelSubtaskFwd`` and ``LowLevelForward`` + # are both gated on "action queue empty", so the subtask must + # refresh *before* the chunk that consumes it. ``MemoryUpdateFwd`` + # / ``UserInterjectionFwd`` are still importable from + # ``inference.steps`` — re-add once plan / memory are in scope. self.pipeline = [ + HighLevelSubtaskFwd( + trigger=HzTrigger(self.high_level_hz), + policy=self.policy, + observation_provider=self.observation_provider, + ), AskVQAFwd( policy=self.policy, observation_provider=self.observation_provider, diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index b255e7417..c9b84b167 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -111,12 +111,14 @@ class LowLevelForward(InferenceStep): if observation is None: return None - # The action expert is conditioned on the TASK string — the - # ``low_level_execution`` recipe renders ``user(${task})``. - # The task is stable for the whole episode and always present, - # so there is no train/inference mismatch and no dependency on - # a (currently unreliable) high-level subtask generator. - ctx = [{"role": "user", "content": state.get("task") or ""}] + # The action expert is conditioned on the SUBTASK generated by + # the high-level loop (``HighLevelSubtaskFwd`` runs earlier in + # the pipeline and writes ``current_subtask``). Matches the + # training-time ``low_level_execution`` recipe — ``user(${subtask})``. + # Falls back to the task string whenever ``current_subtask`` is + # unset or empty (e.g. before the first subtask is generated). 
+ subtask = state.get("current_subtask") or state.get("task") or "" + ctx = [{"role": "user", "content": subtask}] # ``add_generation_prompt=False`` to match the training-time # prefix shape: at training the action expert sees the rendered # user turn ending at ``<|im_end|>`` (no trailing @@ -744,11 +746,12 @@ def _hirobot_user_head(state: dict[str, Any]) -> str: def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]: - """``high_level_subtask`` recipe layout — predict the current subtask - from (task + plan + memory). Even when plan / memory aren't set yet - the labels render as bare ``Plan: `` / ``Memory: `` to match training. + """``high_level_subtask`` recipe layout — predict the subtask from the + task. The current recipe's user turn is just ``${task}`` (plan and + memory are not trained), so the inference prompt is the bare task — + no ``Plan: `` / ``Memory: `` lines. """ - return [{"role": "user", "content": _hirobot_user_head(state)}] + return [{"role": "user", "content": state.get("task") or ""}] def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]: