From e3ad1c59fc69ce29fd4c4903273b9e79929229ee Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 13 May 2026 13:51:37 +0200
Subject: [PATCH] feat(recipes): add plan_generation sub-recipe to smolvla2 +
 pi052 blends
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New text-only sub-recipe at 0.10 weight on both blends:

    user      :  ${task}
    assistant :  ${current_plan}   (high_level target)

Bound to ``active_at(t, style=plan)`` so it supervises the
currently-active plan on every frame where one exists; the
assistant turn is gated by ``if_present`` to skip frames without
a plan annotation.

Weights rebalanced: action_execution 0.85 → 0.75, plan_generation
new at 0.10, VQA top/wrist unchanged at 0.075 each (sums to 1.0).

Added matching runtime builder ``_msgs_for_plan`` in
``smolvla2/inference/steps.py`` so the high-level loop can call
``select_message`` with the bare-task prompt at episode start /
replanning events.

Closes a gap vs. Pi 0.7 §V — without this recipe the model could
read ``${plan}`` from the prompt but never had to produce one.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lerobot/configs/recipes/pi052_hirobot.yaml  | 10 +++++++++-
 .../configs/recipes/smolvla2_hirobot.yaml       | 17 ++++++++++++++---
 .../policies/smolvla2/inference/steps.py        | 10 ++++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml
index c2f80da71..b56d04334 100644
--- a/src/lerobot/configs/recipes/pi052_hirobot.yaml
+++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml
@@ -9,7 +9,7 @@
 blend:
 
   action_execution:
-    weight: 0.85
+    weight: 0.75
     bindings:
       new_memory: "emitted_at(t, style=memory)"
     messages:
@@ -21,6 +21,14 @@ blend:
       # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
 
+  plan_generation:
+    weight: 0.10
+    bindings:
+      current_plan: "active_at(t, style=plan)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
+
   ask_vqa_top:
     weight: 0.075
     bindings:
diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
index 8579d9622..6da9e362b 100644
--- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
+++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
@@ -1,15 +1,18 @@
-# SmolVLA2 Hi-Robot blend — two flavors:
+# SmolVLA2 Hi-Robot blend — three flavors:
 #
 #   1. action_execution  — fused (task + plan + memory) prompt;
 #      supervises the current subtask (low_level: flow + text CE)
 #      and, at memory-boundary frames, the new memory too.
-#   2. ask_vqa_{top,wrist} — text-only VQA on a camera image,
+#   2. plan_generation   — task → plan (text only). Trains the
+#      model to produce a plan from a bare task description so
+#      the runtime can call it at episode start / replanning.
+#   3. ask_vqa_{top,wrist} — text-only VQA on a camera image,
 #      gated by ``if_present`` so they only fire on annotated frames.
 
 blend:
 
   action_execution:
-    weight: 0.85
+    weight: 0.75
     bindings:
       new_memory: "emitted_at(t, style=memory)"
     messages:
@@ -21,6 +24,14 @@ blend:
       # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
 
+  plan_generation:
+    weight: 0.10
+    bindings:
+      current_plan: "active_at(t, style=plan)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
+
   ask_vqa_top:
     weight: 0.075
     bindings:
diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index e49ef3355..878338e17 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -780,6 +780,16 @@ def _msgs_for_interjection(state: dict[str, Any]) -> list[dict[str, Any]]:
     return msgs
 
 
+def _msgs_for_plan(state: dict[str, Any]) -> list[dict[str, Any]]:
+    """``plan_generation`` recipe layout — bare task → plan.
+
+    Only the user turn is rendered: the assistant turn is the generation
+    target, and the runtime appends the predicted plan after sampling.
+    A missing or falsy ``state["task"]`` renders as an empty prompt.
+    """
+    return [{"role": "user", "content": state.get("task") or ""}]
+
+
 def _msgs_for_vqa(question: str) -> list[dict[str, Any]]:
     """``ask_vqa_*`` recipe layout (text-only at inference)."""
     return [{"role": "user", "content": question}]