diff --git a/src/lerobot/configs/recipes/hirobot.yaml b/src/lerobot/configs/recipes/hirobot.yaml index 8eb21cc3c..3cf5bc573 100644 --- a/src/lerobot/configs/recipes/hirobot.yaml +++ b/src/lerobot/configs/recipes/hirobot.yaml @@ -1,14 +1,13 @@ # Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and -# PI052 (PaliGemma backbone). π0.5-style split: +# PI052 (PaliGemma backbone). # -# The action expert is conditioned on (images, state, subtask) only. -# Hierarchical context (task + plan + memory) only flows into the -# high-level text head. +# Trains two things only: subtasks and VQA. Plan and memory are +# intentionally left out for now — keeps the prompt short and the +# training surface small while the core subtask + action loop is +# validated. # -# high_level_subtask — predict subtask from (task+plan+memory), -# and the new memory at boundary frames. +# high_level_subtask — predict the subtask from the task. # low_level_execution — flow loss with [images, subtask, state]. -# plan_generation — task → plan. # ask_vqa_{top,wrist} — camera-grounded VQA. # # Each backbone's text tokenizer renders these messages differently @@ -18,20 +17,13 @@ blend: high_level_subtask: - weight: 0.50 - bindings: - new_memory: "emitted_at(t, style=memory)" + weight: 0.40 messages: - - role: user - stream: high_level - content: "${task}\nPlan: ${plan}\nMemory: ${memory}" + - {role: user, content: "${task}", stream: high_level} - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask} - # Boundary-frame tail: at a subtask transition, also predict - # the new memory in the same forward pass. - - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory} low_level_execution: - weight: 0.30 + weight: 0.40 messages: # π0.5-style action conditioning. The action expert sees only # [images, this user turn (= bare subtask), state]. No text-CE @@ -40,16 +32,8 @@ blend: # flow loss fires. 
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask} - plan_generation: - weight: 0.10 - bindings: - current_plan: "active_at(t, style=plan)" - messages: - - {role: user, content: "${task}", stream: high_level} - - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan} - ask_vqa_top: - weight: 0.05 + weight: 0.10 bindings: vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)" vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)" @@ -63,7 +47,7 @@ blend: - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa} ask_vqa_wrist: - weight: 0.05 + weight: 0.10 bindings: vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)" vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)" diff --git a/src/lerobot/policies/smolvla2/inference/runtime.py b/src/lerobot/policies/smolvla2/inference/runtime.py index 3d76015ab..24b32a940 100644 --- a/src/lerobot/policies/smolvla2/inference/runtime.py +++ b/src/lerobot/policies/smolvla2/inference/runtime.py @@ -30,11 +30,8 @@ from .steps import ( AskVQAFwd, DispatchAction, DispatchToolCalls, - HighLevelSubtaskFwd, InferenceStep, LowLevelForward, - MemoryUpdateFwd, - UserInterjectionFwd, ) from .triggers import HzTrigger, TickClock @@ -69,31 +66,24 @@ class SmolVLA2Runtime: _stop: bool = field(default=False, init=False) def __post_init__(self) -> None: - # Pipeline order matters. Both ``HighLevelSubtaskFwd`` and - # ``LowLevelForward`` are gated on "action queue is empty" so - # the slow LLM call (select_message) doesn't starve dispatch. - # If LowLevelForward runs first, it refills the queue and the - # high-level step never sees ``queue == 0`` afterwards. + # VQA-only configuration (current scope). 
The training recipe + # supervises only subtasks + VQA — plan and memory are out for + # now. The runtime drops the memory-update / interjection steps + # and, for now, subtask generation too. The remaining loop is: # - # Order is therefore: high-level steps that read state (subtask, - # memory, interjection, vqa) → low-level chunk refresh → action - # dispatch → tool dispatch. So on an empty-queue tick the - # subtask refreshes first, the new subtask string flows into - # the next chunk's prompt, and DispatchAction drains. + # AskVQAFwd → answer camera-grounded questions on stdin + # LowLevelForward → action chunk (conditioned on the task + # string directly, since no subtask is + # being generated — see LowLevelForward's + # ``current_subtask or task`` fallback) + # DispatchAction → drain the chunk to the robot + # DispatchToolCalls → fire any pending tool calls + # + # ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` / + # ``UserInterjectionFwd`` are still importable from + # ``inference.steps`` — re-add them here once plan / memory / + # subtask generation is back in scope. self.pipeline = [ - HighLevelSubtaskFwd( - trigger=HzTrigger(self.high_level_hz), - policy=self.policy, - observation_provider=self.observation_provider, - ), - MemoryUpdateFwd( - policy=self.policy, - observation_provider=self.observation_provider, - ), - UserInterjectionFwd( - policy=self.policy, - observation_provider=self.observation_provider, - ), AskVQAFwd( policy=self.policy, observation_provider=self.observation_provider,