From 0f5f0e409170bf95ed3e61c959d01162e1d7cab1 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 16:02:15 +0200
Subject: [PATCH] refactor(recipes): rename recipes, drop pi05_hirobot

- hirobot.yaml            -> subtasks_vqa.yaml
- hirobot_memory.yaml     -> subtask_mem_vqa_speech.yaml
- pi05_hirobot.yaml       -> deleted (stale: uses plan, top-camera names;
  superseded by the two recipes above)
- smolvla2_hirobot.yaml   -> deleted locally (untracked stale file, so
  it does not appear in the diffstat below)

Updated the smolvla2 / pi052 `recipe_path` config defaults, all
docstring / comment references, and the annotation-pipeline + recipe
docs. The three tests that loaded pi05_hirobot.yaml are repointed to
the renamed recipes:

- the low-level-branch and pipeline-render assertions now accept a
  flow-only `low_level` stream as valid supervision, since the new
  recipes' low_level_execution has no text-CE target;
- the pipeline-render test passes the loop index as `sample_idx`
  instead of a constant 0;
- the expected blend weight sum in test_recipe.py changes from 0.96
  to 1.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           |  2 +-
 docs/source/language_and_recipes.mdx          |  2 +-
 .../steerable_pipeline/modules/general_vqa.py |  2 +-
 src/lerobot/configs/recipes/pi05_hirobot.yaml | 74 -------------------
 ...emory.yaml => subtask_mem_vqa_speech.yaml} |  6 +-
 .../{hirobot.yaml => subtasks_vqa.yaml}       | 10 +--
 src/lerobot/policies/pi052/__init__.py        |  2 +-
 .../policies/pi052/configuration_pi052.py     |  2 +-
 .../smolvla2/chat_processor_smolvla2.py       |  2 +-
 .../smolvla2/configuration_smolvla2.py        |  2 +-
 .../policies/smolvla2/inference/steps.py      |  6 +-
 .../policies/smolvla2/processor_smolvla2.py   |  2 +-
 .../test_pipeline_recipe_render.py            | 22 ++++--
 tests/configs/test_recipe.py                  |  6 +-
 tests/datasets/test_language_render.py        |  9 ++-
 15 files changed, 46 insertions(+), 103 deletions(-)
 delete mode 100644 src/lerobot/configs/recipes/pi05_hirobot.yaml
 rename src/lerobot/configs/recipes/{hirobot_memory.yaml => subtask_mem_vqa_speech.yaml} (95%)
 rename src/lerobot/configs/recipes/{hirobot.yaml => subtasks_vqa.yaml} (88%)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index a3233551a..cf00dd53e 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -72,7 +72,7 @@ The executor picks `LocalPipelineExecutor` for small datasets and
 ## Style-to-recipe consumer mapping
 
 The pipeline produces exactly the styles consumed by
-`src/lerobot/configs/recipes/pi05_hirobot.yaml`:
+`src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml`:
 
 - `low_level_execution`, `high_level_subtask`, `memory_update` consume
   `subtask`/`plan`/`memory` from `language_persistent`.
diff --git a/docs/source/language_and_recipes.mdx b/docs/source/language_and_recipes.mdx
index 952b6ef09..3195be582 100644
--- a/docs/source/language_and_recipes.mdx
+++ b/docs/source/language_and_recipes.mdx
@@ -101,7 +101,7 @@ The renderer does not apply a tokenizer chat template. Policy processors decide
 ## Blends
 
 Blend recipes select one weighted sub-recipe deterministically from the sample index.
-The canonical `recipes/pi05_hirobot.yaml` combines memory updates, interjection responses, high-level subtask prediction, low-level execution, and VQA.
+`recipes/subtasks_vqa.yaml` trains the core blend — high-level subtask prediction, low-level execution, and VQA. `recipes/subtask_mem_vqa_speech.yaml` is the fuller variant that also adds memory updates and spoken interjection responses.
 
 ## Graceful absence
 
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index 56174bc34..8d32551bc 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -21,7 +21,7 @@ one ``(vqa, user)`` + ``(vqa, assistant)`` pair *per camera*: each pair is
 generated against that camera's frame and stamped with the matching
 ``camera`` field on the emitted rows. The resolver disambiguates via
 ``camera=...``; recipes that consume VQA do so through one sub-recipe
-per camera (see ``recipes/pi05_hirobot.yaml``).
+per camera (see ``recipes/subtasks_vqa.yaml``).
 
 Within a single (frame, camera) we still emit at most one ``(vqa, user)``
 and one ``(vqa, assistant)`` row, so the resolver contract stays scalar.
diff --git a/src/lerobot/configs/recipes/pi05_hirobot.yaml b/src/lerobot/configs/recipes/pi05_hirobot.yaml
deleted file mode 100644
index 7cd6b009f..000000000
--- a/src/lerobot/configs/recipes/pi05_hirobot.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-blend:
-
-  memory_update:
-    weight: 0.10
-    bindings:
-      prior_memory: "nth_prev(style=memory, offset=1)"
-      current_memory: "emitted_at(t, style=memory)"
-      completed_subtask: "nth_prev(style=subtask, offset=1)"
-    messages:
-      - {role: user, content: "${task}", stream: high_level}
-      - {role: assistant, content: "Previous memory: ${prior_memory}", stream: high_level, if_present: prior_memory}
-      - {role: user, content: "Completed subtask: ${completed_subtask}", stream: high_level, if_present: completed_subtask}
-      - {role: assistant, content: "${current_memory}", stream: high_level, target: true, if_present: current_memory}
-
-  user_interjection_response:
-    weight: 0.16
-    bindings:
-      prior_plan: "nth_prev(style=plan, offset=1)"
-      current_plan: "emitted_at(t, style=plan)"
-      interjection: "emitted_at(t, style=interjection)"
-      speech: "emitted_at(t, role=assistant, tool_name=say)"
-    messages:
-      - {role: user, content: "${task}", stream: high_level}
-      - {role: assistant, content: "Previous plan:\n${prior_plan}", stream: high_level, if_present: prior_plan}
-      - {role: user, content: "${interjection}", stream: high_level, if_present: interjection}
-      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan, tool_calls_from: speech}
-
-  high_level_subtask:
-    weight: 0.15
-    bindings:
-      next_subtask: "nth_next(style=subtask, offset=1)"
-    messages:
-      - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level}
-      - {role: user, content: "Current subtask: ${subtask}", stream: high_level, if_present: subtask}
-      - {role: assistant, content: "${next_subtask}", stream: high_level, target: true}
-
-  low_level_execution:
-    weight: 0.35
-    messages:
-      - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level}
-      - {role: assistant, content: "${subtask}", stream: low_level, target: true}
-
-  # VQA is view-dependent: bbox / keypoint / count answers only make sense for
-  # the camera they were grounded against. Each camera gets its own sub-recipe
-  # so the resolver can disambiguate via `camera=...` and the user-turn carries
-  # the matching image block. Adjust the camera keys (and add more sub-recipes)
-  # to match the cameras present on your dataset.
-  ask_vqa_top:
-    weight: 0.10
-    bindings:
-      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
-      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
-    messages:
-      - role: user
-        stream: high_level
-        if_present: vqa_query
-        content:
-          - {type: image, feature: observation.images.top}
-          - {type: text, text: "${vqa_query}"}
-      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
-
-  ask_vqa_wrist:
-    weight: 0.10
-    bindings:
-      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
-      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
-    messages:
-      - role: user
-        stream: high_level
-        if_present: vqa_query
-        content:
-          - {type: image, feature: observation.images.wrist}
-          - {type: text, text: "${vqa_query}"}
-      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
diff --git a/src/lerobot/configs/recipes/hirobot_memory.yaml b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
similarity index 95%
rename from src/lerobot/configs/recipes/hirobot_memory.yaml
rename to src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
index 2470c65cf..4081e6972 100644
--- a/src/lerobot/configs/recipes/hirobot_memory.yaml
+++ b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
@@ -1,6 +1,6 @@
-# Hi-Robot blend + memory + tool-call (spoken) responses.
+# subtask_mem_vqa_speech — Hi-Robot blend + memory + spoken responses.
 #
-# Superset of hirobot.yaml. Keeps the core subtask + action + VQA
+# Superset of subtasks_vqa.yaml. Keeps the core subtask + action + VQA
 # training, and adds two text-supervised tasks:
 #
 #   high_level_subtask         — predict the subtask from the task.
@@ -73,7 +73,7 @@ blend:
 
   # VQA is view-dependent — each camera gets its own sub-recipe so the
   # resolver disambiguates via `camera=...`. Camera keys match
-  # hirobot.yaml (`front` + `wrist`); adjust to your dataset.
+  # subtasks_vqa.yaml (`front` + `wrist`); adjust to your dataset.
   ask_vqa_top:
     weight: 0.075
     bindings:
diff --git a/src/lerobot/configs/recipes/hirobot.yaml b/src/lerobot/configs/recipes/subtasks_vqa.yaml
similarity index 88%
rename from src/lerobot/configs/recipes/hirobot.yaml
rename to src/lerobot/configs/recipes/subtasks_vqa.yaml
index 2fae907e1..1002b3b8e 100644
--- a/src/lerobot/configs/recipes/hirobot.yaml
+++ b/src/lerobot/configs/recipes/subtasks_vqa.yaml
@@ -1,10 +1,10 @@
-# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
-# PI052 (PaliGemma backbone).
+# subtasks_vqa — Hi-Robot blend, shared between SmolVLA2 (SmolVLM2
+# backbone) and PI052 (PaliGemma backbone).
 #
 #   Trains two things only: subtasks and VQA. Plan and memory are
-#   intentionally left out for now — keeps the prompt short and the
-#   training surface small while the core subtask + action loop is
-#   validated.
+#   intentionally left out — keeps the prompt short and the training
+#   surface small. The fuller blend with memory + spoken replies is
+#   ``subtask_mem_vqa_speech.yaml``.
 #
 #     high_level_subtask  — predict the subtask from the task.
 #     low_level_execution — flow loss with [images, subtask, state].
diff --git a/src/lerobot/policies/pi052/__init__.py b/src/lerobot/policies/pi052/__init__.py
index d94e17007..c726b7790 100644
--- a/src/lerobot/policies/pi052/__init__.py
+++ b/src/lerobot/policies/pi052/__init__.py
@@ -24,7 +24,7 @@ Extends :class:`lerobot.policies.pi05.PI05Policy` with:
 * per-component prompt dropout (Pi 0.7 §V.E) for regularising the
   text head against missing context at inference.
 
-See ``src/lerobot/configs/recipes/hirobot.yaml`` for the
+See ``src/lerobot/configs/recipes/subtasks_vqa.yaml`` for the
 canonical training recipe and
 ``examples/training/pi052_hirobot.slurm`` for the launcher.
 """
diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py
index b1197d787..4214baba7 100644
--- a/src/lerobot/policies/pi052/configuration_pi052.py
+++ b/src/lerobot/policies/pi052/configuration_pi052.py
@@ -55,7 +55,7 @@ class PI052Config(PI05Config):
     """
 
     # Recipe / language stack ---------------------------------------------
-    recipe_path: str | None = "recipes/hirobot.yaml"
+    recipe_path: str | None = "recipes/subtasks_vqa.yaml"
     """Path (absolute or relative to ``src/lerobot/configs/``) to a
     ``TrainingRecipe`` YAML. Defaults to the canonical Hi-Robot blend
     shipped alongside this policy. Set to ``None`` to disable recipe
diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
index 2012c895d..b628d54c5 100644
--- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
@@ -405,7 +405,7 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
         """Probabilistically drop non-target context messages.
 
         Heuristic content sniffing — matches the prefix strings that
-        ``hirobot.yaml``'s recipes use when injecting plan /
+        ``subtask_mem_vqa_speech.yaml``'s recipes use when injecting plan /
         memory / subtask / interjection content. Anything else is
         kept unchanged. Target messages are never dropped (we still
         need their tokens for supervision).
diff --git a/src/lerobot/policies/smolvla2/configuration_smolvla2.py b/src/lerobot/policies/smolvla2/configuration_smolvla2.py
index 86231e6ff..9238dd07e 100644
--- a/src/lerobot/policies/smolvla2/configuration_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/configuration_smolvla2.py
@@ -56,7 +56,7 @@ class SmolVLA2Config(SmolVLAConfig):
     """
 
     # Recipe / language stack ---------------------------------------------
-    recipe_path: str | None = "recipes/hirobot.yaml"
+    recipe_path: str | None = "recipes/subtasks_vqa.yaml"
     """Path (absolute or relative to ``src/lerobot/configs/``) to a
     ``TrainingRecipe`` YAML. The default points at the canonical Hi Robot
     blend shipped alongside SmolVLA2. Set to ``None`` to disable recipe
diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index e37604924..93417b824 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -17,7 +17,7 @@ Each step is a tiny class with a ``trigger`` and an ``__call__(state)``;
 the runtime applies them in order each tick. When a step's trigger
 doesn't fire, the step is a no-op and the runtime moves on.
 
-Stream-to-step mapping mirrors the ``hirobot.yaml`` recipe:
+Stream-to-step mapping mirrors the ``subtasks_vqa.yaml`` recipe:
 
 * ``LowLevelForward``        — calls ``policy.select_action`` for the
                                 action chunk; trained by
@@ -721,7 +721,7 @@ def _control_context_messages(
 ) -> list[dict[str, Any]]:
     """Build a chat-template-ready prompt from current runtime state.
 
-    Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan:
+    Renders the generic plan/memory context layout ``${task}\nPlan:
     ${plan}\nMemory: ${memory}`` for the high-level branches.
     """
     # Always emit ``Plan: `` / ``Memory: `` labels — even with empty
@@ -741,7 +741,7 @@ def _control_context_messages(
 
 # ---------------------------------------------------------------------------
 # Per-recipe prompt builders. Each one mirrors a single sub-recipe's
-# message layout in ``hirobot.yaml`` so the chat-templated
+# message layout in ``subtasks_vqa.yaml`` so the chat-templated
 # prompt at inference matches what the model saw during training.
 # Generic ``_control_context_messages`` is kept around as a fallback
 # for ad-hoc callers but the four high-level steps now use these.
diff --git a/src/lerobot/policies/smolvla2/processor_smolvla2.py b/src/lerobot/policies/smolvla2/processor_smolvla2.py
index a76608502..9d0913e0b 100644
--- a/src/lerobot/policies/smolvla2/processor_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/processor_smolvla2.py
@@ -121,7 +121,7 @@ def _load_recipe(path_str: str) -> TrainingRecipe:
 
     Accepts an absolute path or a path relative to
     ``src/lerobot/configs/`` so recipe authors can write
-    ``--policy.recipe_path=recipes/hirobot.yaml``.
+    ``--policy.recipe_path=recipes/subtasks_vqa.yaml``.
     """
     p = Path(path_str)
     if not p.is_absolute() and not p.exists():
diff --git a/tests/annotations/test_pipeline_recipe_render.py b/tests/annotations/test_pipeline_recipe_render.py
index 3cbd92358..fccda6bd6 100644
--- a/tests/annotations/test_pipeline_recipe_render.py
+++ b/tests/annotations/test_pipeline_recipe_render.py
@@ -41,7 +41,12 @@ from lerobot.datasets.language_render import render_sample
 from ._helpers import make_canned_responder
 
 _RECIPE_PATH = (
-    Path(__file__).resolve().parents[2] / "src" / "lerobot" / "configs" / "recipes" / "pi05_hirobot.yaml"
+    Path(__file__).resolve().parents[2]
+    / "src"
+    / "lerobot"
+    / "configs"
+    / "recipes"
+    / "subtask_mem_vqa_speech.yaml"
 )
 
 
@@ -105,22 +110,29 @@ def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output(
         recipe = TrainingRecipe(**loaded)
 
     rendered_any = False
-    for ts, persistent, events in zip(timestamps, persistent_lists, events_lists, strict=True):
+    for sample_idx, (ts, persistent, events) in enumerate(
+        zip(timestamps, persistent_lists, events_lists, strict=True)
+    ):
         result = render_sample(
             recipe=recipe,
             persistent=persistent,
             events=events,
             t=float(ts),
-            sample_idx=0,
+            sample_idx=sample_idx,
             dataset_ctx={"task": "Pour water from the bottle into the cup."},
         )
         if result is None:
             continue
         if result["messages"]:
             rendered_any = True
-            assert result["target_message_indices"]
+            # A valid render supervises something: a text-CE target turn
+            # OR a flow-only ``low_level``-stream turn (action loss).
+            assert (
+                result["target_message_indices"]
+                or "low_level" in result["message_streams"]
+            )
             break
-    assert rendered_any, "PR 1 recipe rendered no messages from pipeline output"
+    assert rendered_any, "recipe rendered no messages from pipeline output"
 
     # Sanity: speech atom appears in events column intact
     flat_events = [r for ev in events_lists for r in ev]
diff --git a/tests/configs/test_recipe.py b/tests/configs/test_recipe.py
index bd71d540f..462af14d7 100644
--- a/tests/configs/test_recipe.py
+++ b/tests/configs/test_recipe.py
@@ -18,7 +18,9 @@ def test_message_recipe_validates_unknown_binding():
 
 
 def test_canonical_recipe_loads():
-    recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/pi05_hirobot.yaml"))
+    recipe = TrainingRecipe.from_yaml(
+        Path("src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml")
+    )
 
     assert recipe.blend is not None
     assert set(recipe.blend) == {
@@ -29,4 +31,4 @@ def test_canonical_recipe_loads():
         "ask_vqa_top",
         "ask_vqa_wrist",
     }
-    assert sum(component.weight for component in recipe.blend.values()) == pytest.approx(0.96)
+    assert sum(component.weight for component in recipe.blend.values()) == pytest.approx(1.0)
diff --git a/tests/datasets/test_language_render.py b/tests/datasets/test_language_render.py
index f8bd7ce4f..b196e4e59 100644
--- a/tests/datasets/test_language_render.py
+++ b/tests/datasets/test_language_render.py
@@ -449,7 +449,10 @@ def test_vqa_frame_is_consumed_over_the_weighted_blend():
 
 
 def test_canonical_recipe_can_render_low_level_branch():
-    recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/pi05_hirobot.yaml"))
+    """The shipped ``subtasks_vqa.yaml`` recipe's ``low_level_execution``
+    branch renders — a flow-only ``user(${subtask})`` turn (no text-CE
+    target; its supervision is the action-expert flow loss)."""
+    recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/subtasks_vqa.yaml"))
     low_level = TrainingRecipe(blend={"low": recipe.blend["low_level_execution"]})
 
     rendered = render_sample(
@@ -461,6 +464,6 @@ def test_canonical_recipe_can_render_low_level_branch():
         task="clean kitchen",
     )
 
-    assert rendered["messages"][-1] == {"role": "assistant", "content": "subtask 0"}
+    assert rendered["messages"][-1] == {"role": "user", "content": "subtask 0"}
     assert rendered["message_streams"][-1] == "low_level"
-    assert rendered["target_message_indices"] == [1]
+    assert rendered["target_message_indices"] == []