From 0f5f0e409170bf95ed3e61c959d01162e1d7cab1 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 18 May 2026 16:02:15 +0200 Subject: [PATCH] refactor(recipes): rename recipes, drop pi05_hirobot - hirobot.yaml -> subtasks_vqa.yaml - hirobot_memory.yaml -> subtask_mem_vqa_speech.yaml - pi05_hirobot.yaml -> deleted (stale: uses plan, top-camera names; superseded by the two recipes above) - smolvla2_hirobot.yaml -> deleted (was untracked stale junk) Updated the smolvla2 / pi052 `recipe_path` config defaults, all docstring / comment references, the annotation-pipeline + recipe docs, and the three tests that loaded pi05_hirobot.yaml (repointed to the renamed recipes; the low-level-branch and pipeline-render assertions now accept a flow-only `low_level` stream as valid supervision, since the new recipes' low_level_execution has no text-CE target). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/source/annotation_pipeline.mdx | 2 +- docs/source/language_and_recipes.mdx | 2 +- .../steerable_pipeline/modules/general_vqa.py | 2 +- src/lerobot/configs/recipes/pi05_hirobot.yaml | 74 ------------------- ...emory.yaml => subtask_mem_vqa_speech.yaml} | 6 +- .../{hirobot.yaml => subtasks_vqa.yaml} | 10 +-- src/lerobot/policies/pi052/__init__.py | 2 +- .../policies/pi052/configuration_pi052.py | 2 +- .../smolvla2/chat_processor_smolvla2.py | 2 +- .../smolvla2/configuration_smolvla2.py | 2 +- .../policies/smolvla2/inference/steps.py | 6 +- .../policies/smolvla2/processor_smolvla2.py | 2 +- .../test_pipeline_recipe_render.py | 22 ++++-- tests/configs/test_recipe.py | 6 +- tests/datasets/test_language_render.py | 9 ++- 15 files changed, 46 insertions(+), 103 deletions(-) delete mode 100644 src/lerobot/configs/recipes/pi05_hirobot.yaml rename src/lerobot/configs/recipes/{hirobot_memory.yaml => subtask_mem_vqa_speech.yaml} (95%) rename src/lerobot/configs/recipes/{hirobot.yaml => subtasks_vqa.yaml} (88%) diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx index a3233551a..cf00dd53e 100644 --- a/docs/source/annotation_pipeline.mdx +++ b/docs/source/annotation_pipeline.mdx @@ -72,7 +72,7 @@ The executor picks `LocalPipelineExecutor` for small datasets and ## Style-to-recipe consumer mapping The pipeline produces exactly the styles consumed by -`src/lerobot/configs/recipes/pi05_hirobot.yaml`: +`src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml`: - `low_level_execution`, `high_level_subtask`, `memory_update` consume `subtask`/`plan`/`memory` from `language_persistent`. diff --git a/docs/source/language_and_recipes.mdx b/docs/source/language_and_recipes.mdx index 952b6ef09..3195be582 100644 --- a/docs/source/language_and_recipes.mdx +++ b/docs/source/language_and_recipes.mdx @@ -101,7 +101,7 @@ The renderer does not apply a tokenizer chat template. Policy processors decide ## Blends Blend recipes select one weighted sub-recipe deterministically from the sample index. -The canonical `recipes/pi05_hirobot.yaml` combines memory updates, interjection responses, high-level subtask prediction, low-level execution, and VQA. +`recipes/subtasks_vqa.yaml` trains the core blend — high-level subtask prediction, low-level execution, and VQA. `recipes/subtask_mem_vqa_speech.yaml` is the fuller variant that also adds memory updates and spoken interjection responses. ## Graceful absence diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py index 56174bc34..8d32551bc 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py @@ -21,7 +21,7 @@ one ``(vqa, user)`` + ``(vqa, assistant)`` pair *per camera*: each pair is generated against that camera's frame and stamped with the matching ``camera`` field on the emitted rows. The resolver disambiguates via ``camera=...``; recipes that consume VQA do so through one sub-recipe -per camera (see ``recipes/pi05_hirobot.yaml``). +per camera (see ``recipes/subtasks_vqa.yaml``). Within a single (frame, camera) we still emit at most one ``(vqa, user)`` and one ``(vqa, assistant)`` row, so the resolver contract stays scalar. diff --git a/src/lerobot/configs/recipes/pi05_hirobot.yaml b/src/lerobot/configs/recipes/pi05_hirobot.yaml deleted file mode 100644 index 7cd6b009f..000000000 --- a/src/lerobot/configs/recipes/pi05_hirobot.yaml +++ /dev/null @@ -1,74 +0,0 @@ -blend: - - memory_update: - weight: 0.10 - bindings: - prior_memory: "nth_prev(style=memory, offset=1)" - current_memory: "emitted_at(t, style=memory)" - completed_subtask: "nth_prev(style=subtask, offset=1)" - messages: - - {role: user, content: "${task}", stream: high_level} - - {role: assistant, content: "Previous memory: ${prior_memory}", stream: high_level, if_present: prior_memory} - - {role: user, content: "Completed subtask: ${completed_subtask}", stream: high_level, if_present: completed_subtask} - - {role: assistant, content: "${current_memory}", stream: high_level, target: true, if_present: current_memory} - - user_interjection_response: - weight: 0.16 - bindings: - prior_plan: "nth_prev(style=plan, offset=1)" - current_plan: "emitted_at(t, style=plan)" - interjection: "emitted_at(t, style=interjection)" - speech: "emitted_at(t, role=assistant, tool_name=say)" - messages: - - {role: user, content: "${task}", stream: high_level} - - {role: assistant, content: "Previous plan:\n${prior_plan}", stream: high_level, if_present: prior_plan} - - {role: user, content: "${interjection}", stream: high_level, if_present: interjection} - - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan, tool_calls_from: speech} - - high_level_subtask: - weight: 0.15 - bindings: - next_subtask: "nth_next(style=subtask, offset=1)" - messages: - - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - - {role: user, content: "Current subtask: ${subtask}", stream: high_level, if_present: subtask} - - {role: assistant, content: "${next_subtask}", stream: high_level, target: true} - - low_level_execution: - weight: 0.35 - messages: - - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - - {role: assistant, content: "${subtask}", stream: low_level, target: true} - - # VQA is view-dependent: bbox / keypoint / count answers only make sense for - # the camera they were grounded against. Each camera gets its own sub-recipe - # so the resolver can disambiguate via `camera=...` and the user-turn carries - # the matching image block. Adjust the camera keys (and add more sub-recipes) - # to match the cameras present on your dataset. - ask_vqa_top: - weight: 0.10 - bindings: - vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)" - vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)" - messages: - - role: user - stream: high_level - if_present: vqa_query - content: - - {type: image, feature: observation.images.top} - - {type: text, text: "${vqa_query}"} - - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa} - - ask_vqa_wrist: - weight: 0.10 - bindings: - vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)" - vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)" - messages: - - role: user - stream: high_level - if_present: vqa_query - content: - - {type: image, feature: observation.images.wrist} - - {type: text, text: "${vqa_query}"} - - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa} diff --git a/src/lerobot/configs/recipes/hirobot_memory.yaml b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml similarity index 95% rename from src/lerobot/configs/recipes/hirobot_memory.yaml rename to src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml index 2470c65cf..4081e6972 100644 --- a/src/lerobot/configs/recipes/hirobot_memory.yaml +++ b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml @@ -1,6 +1,6 @@ -# Hi-Robot blend + memory + tool-call (spoken) responses. +# subtask_mem_vqa_speech — Hi-Robot blend + memory + spoken responses. # -# Superset of hirobot.yaml. Keeps the core subtask + action + VQA +# Superset of subtasks_vqa.yaml. Keeps the core subtask + action + VQA # training, and adds two text-supervised tasks: # # high_level_subtask — predict the subtask from the task. @@ -73,7 +73,7 @@ blend: # VQA is view-dependent — each camera gets its own sub-recipe so the # resolver disambiguates via `camera=...`. Camera keys match - # hirobot.yaml (`front` + `wrist`); adjust to your dataset. + # subtasks_vqa.yaml (`front` + `wrist`); adjust to your dataset. ask_vqa_top: weight: 0.075 bindings: diff --git a/src/lerobot/configs/recipes/hirobot.yaml b/src/lerobot/configs/recipes/subtasks_vqa.yaml similarity index 88% rename from src/lerobot/configs/recipes/hirobot.yaml rename to src/lerobot/configs/recipes/subtasks_vqa.yaml index 2fae907e1..1002b3b8e 100644 --- a/src/lerobot/configs/recipes/hirobot.yaml +++ b/src/lerobot/configs/recipes/subtasks_vqa.yaml @@ -1,10 +1,10 @@ -# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and -# PI052 (PaliGemma backbone). +# subtasks_vqa — Hi-Robot blend, shared between SmolVLA2 (SmolVLM2 +# backbone) and PI052 (PaliGemma backbone). # # Trains two things only: subtasks and VQA. Plan and memory are -# intentionally left out for now — keeps the prompt short and the -# training surface small while the core subtask + action loop is -# validated. +# intentionally left out — keeps the prompt short and the training +# surface small. The fuller blend with memory + spoken replies is +# ``subtask_mem_vqa_speech.yaml``. # # high_level_subtask — predict the subtask from the task. # low_level_execution — flow loss with [images, subtask, state]. diff --git a/src/lerobot/policies/pi052/__init__.py b/src/lerobot/policies/pi052/__init__.py index d94e17007..c726b7790 100644 --- a/src/lerobot/policies/pi052/__init__.py +++ b/src/lerobot/policies/pi052/__init__.py @@ -24,7 +24,7 @@ Extends :class:`lerobot.policies.pi05.PI05Policy` with: * per-component prompt dropout (Pi 0.7 §V.E) for regularising the text head against missing context at inference. -See ``src/lerobot/configs/recipes/hirobot.yaml`` for the +See ``src/lerobot/configs/recipes/subtasks_vqa.yaml`` for the canonical training recipe and ``examples/training/pi052_hirobot.slurm`` for the launcher. """ diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py index b1197d787..4214baba7 100644 --- a/src/lerobot/policies/pi052/configuration_pi052.py +++ b/src/lerobot/policies/pi052/configuration_pi052.py @@ -55,7 +55,7 @@ class PI052Config(PI05Config): """ # Recipe / language stack --------------------------------------------- - recipe_path: str | None = "recipes/hirobot.yaml" + recipe_path: str | None = "recipes/subtasks_vqa.yaml" """Path (absolute or relative to ``src/lerobot/configs/``) to a ``TrainingRecipe`` YAML. Defaults to the canonical Hi-Robot blend shipped alongside this policy. Set to ``None`` to disable recipe diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py index 2012c895d..b628d54c5 100644 --- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py @@ -405,7 +405,7 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep): """Probabilistically drop non-target context messages. Heuristic content sniffing — matches the prefix strings that - ``hirobot.yaml``'s recipes use when injecting plan / + ``subtask_mem_vqa_speech.yaml``'s recipes use when injecting plan / memory / subtask / interjection content. Anything else is kept unchanged. Target messages are never dropped (we still need their tokens for supervision). diff --git a/src/lerobot/policies/smolvla2/configuration_smolvla2.py b/src/lerobot/policies/smolvla2/configuration_smolvla2.py index 86231e6ff..9238dd07e 100644 --- a/src/lerobot/policies/smolvla2/configuration_smolvla2.py +++ b/src/lerobot/policies/smolvla2/configuration_smolvla2.py @@ -56,7 +56,7 @@ class SmolVLA2Config(SmolVLAConfig): """ # Recipe / language stack --------------------------------------------- - recipe_path: str | None = "recipes/hirobot.yaml" + recipe_path: str | None = "recipes/subtasks_vqa.yaml" """Path (absolute or relative to ``src/lerobot/configs/``) to a ``TrainingRecipe`` YAML. The default points at the canonical Hi Robot blend shipped alongside SmolVLA2. Set to ``None`` to disable recipe diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index e37604924..93417b824 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -17,7 +17,7 @@ Each step is a tiny class with a ``trigger`` and an ``__call__(state)``; the runtime applies them in order each tick. When a step's trigger doesn't fire, the step is a no-op and the runtime moves on. -Stream-to-step mapping mirrors the ``hirobot.yaml`` recipe: +Stream-to-step mapping mirrors the ``subtasks_vqa.yaml`` recipe: * ``LowLevelForward`` — calls ``policy.select_action`` for the action chunk; trained by @@ -721,7 +721,7 @@ def _control_context_messages( ) -> list[dict[str, Any]]: """Build a chat-template-ready prompt from current runtime state. - Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan: + Mirrors what ``subtasks_vqa.yaml`` renders into ``${task}\nPlan: ${plan}\nMemory: ${memory}`` for the high-level branches. """ # Always emit ``Plan: `` / ``Memory: `` labels — even with empty @@ -741,7 +741,7 @@ def _control_context_messages( # --------------------------------------------------------------------------- # Per-recipe prompt builders. Each one mirrors a single sub-recipe's -# message layout in ``hirobot.yaml`` so the chat-templated +# message layout in ``subtasks_vqa.yaml`` so the chat-templated # prompt at inference matches what the model saw during training. # Generic ``_control_context_messages`` is kept around as a fallback # for ad-hoc callers but the four high-level steps now use these. diff --git a/src/lerobot/policies/smolvla2/processor_smolvla2.py b/src/lerobot/policies/smolvla2/processor_smolvla2.py index a76608502..9d0913e0b 100644 --- a/src/lerobot/policies/smolvla2/processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/processor_smolvla2.py @@ -121,7 +121,7 @@ def _load_recipe(path_str: str) -> TrainingRecipe: Accepts an absolute path or a path relative to ``src/lerobot/configs/`` so recipe authors can write - ``--policy.recipe_path=recipes/hirobot.yaml``. + ``--policy.recipe_path=recipes/subtasks_vqa.yaml``. """ p = Path(path_str) if not p.is_absolute() and not p.exists(): diff --git a/tests/annotations/test_pipeline_recipe_render.py b/tests/annotations/test_pipeline_recipe_render.py index 3cbd92358..fccda6bd6 100644 --- a/tests/annotations/test_pipeline_recipe_render.py +++ b/tests/annotations/test_pipeline_recipe_render.py @@ -41,7 +41,12 @@ from lerobot.datasets.language_render import render_sample from ._helpers import make_canned_responder _RECIPE_PATH = ( - Path(__file__).resolve().parents[2] / "src" / "lerobot" / "configs" / "recipes" / "pi05_hirobot.yaml" + Path(__file__).resolve().parents[2] + / "src" + / "lerobot" + / "configs" + / "recipes" + / "subtask_mem_vqa_speech.yaml" ) @@ -105,22 +110,29 @@ def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output( recipe = TrainingRecipe(**loaded) rendered_any = False - for ts, persistent, events in zip(timestamps, persistent_lists, events_lists, strict=True): + for sample_idx, (ts, persistent, events) in enumerate( + zip(timestamps, persistent_lists, events_lists, strict=True) + ): result = render_sample( recipe=recipe, persistent=persistent, events=events, t=float(ts), - sample_idx=0, + sample_idx=sample_idx, dataset_ctx={"task": "Pour water from the bottle into the cup."}, ) if result is None: continue if result["messages"]: rendered_any = True - assert result["target_message_indices"] + # A valid render supervises something: a text-CE target turn + # OR a flow-only ``low_level``-stream turn (action loss). + assert ( + result["target_message_indices"] + or "low_level" in result["message_streams"] + ) break - assert rendered_any, "PR 1 recipe rendered no messages from pipeline output" + assert rendered_any, "recipe rendered no messages from pipeline output" # Sanity: speech atom appears in events column intact flat_events = [r for ev in events_lists for r in ev] diff --git a/tests/configs/test_recipe.py b/tests/configs/test_recipe.py index bd71d540f..462af14d7 100644 --- a/tests/configs/test_recipe.py +++ b/tests/configs/test_recipe.py @@ -18,7 +18,9 @@ def test_message_recipe_validates_unknown_binding(): def test_canonical_recipe_loads(): - recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/pi05_hirobot.yaml")) + recipe = TrainingRecipe.from_yaml( + Path("src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml") + ) assert recipe.blend is not None assert set(recipe.blend) == { @@ -29,4 +31,4 @@ def test_canonical_recipe_loads(): "ask_vqa_top", "ask_vqa_wrist", } - assert sum(component.weight for component in recipe.blend.values()) == pytest.approx(0.96) + assert sum(component.weight for component in recipe.blend.values()) == pytest.approx(1.0) diff --git a/tests/datasets/test_language_render.py b/tests/datasets/test_language_render.py index f8bd7ce4f..b196e4e59 100644 --- a/tests/datasets/test_language_render.py +++ b/tests/datasets/test_language_render.py @@ -449,7 +449,10 @@ def test_vqa_frame_is_consumed_over_the_weighted_blend(): def test_canonical_recipe_can_render_low_level_branch(): - recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/pi05_hirobot.yaml")) + """The shipped ``subtasks_vqa.yaml`` recipe's ``low_level_execution`` + branch renders — a flow-only ``user(${subtask})`` turn (no text-CE + target; its supervision is the action-expert flow loss).""" + recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/subtasks_vqa.yaml")) low_level = TrainingRecipe(blend={"low": recipe.blend["low_level_execution"]}) rendered = render_sample( @@ -461,6 +464,6 @@ def test_canonical_recipe_can_render_low_level_branch(): task="clean kitchen", ) - assert rendered["messages"][-1] == {"role": "assistant", "content": "subtask 0"} + assert rendered["messages"][-1] == {"role": "user", "content": "subtask 0"} assert rendered["message_streams"][-1] == "low_level" - assert rendered["target_message_indices"] == [1] + assert rendered["target_message_indices"] == []