From e3ad1c59fc69ce29fd4c4903273b9e79929229ee Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 13:51:37 +0200 Subject: [PATCH] feat(recipes): add plan_generation sub-recipe to smolvla2 + pi052 blends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New text-only sub-recipe at 0.10 weight on both blends: user : ${task} assistant : ${current_plan} (high_level target) Bound to ``active_at(t, style=plan)`` so it supervises the currently-active plan on every frame, gated by ``if_present`` to skip frames without a plan annotation. Weights rebalanced: action_execution 0.85 → 0.75, plan_generation 0.10, VQA top/wrist 0.075 each (sums to 1.0). Added matching runtime builder ``_msgs_for_plan`` in ``smolvla2/inference/steps.py`` so the high-level loop can call ``select_message`` with the bare-task prompt at episode start / replanning events. Closes a gap vs. Pi 0.7 §V — without this recipe the model could read ``${plan}`` from the prompt but never had to produce one. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lerobot/configs/recipes/pi052_hirobot.yaml | 10 +++++++++- .../configs/recipes/smolvla2_hirobot.yaml | 17 ++++++++++++++--- .../policies/smolvla2/inference/steps.py | 10 ++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml index c2f80da71..b56d04334 100644 --- a/src/lerobot/configs/recipes/pi052_hirobot.yaml +++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml @@ -9,7 +9,7 @@ blend: action_execution: - weight: 0.85 + weight: 0.75 bindings: new_memory: "emitted_at(t, style=memory)" messages: @@ -21,6 +21,14 @@ blend: # new memory as a second assistant turn (same forward pass). - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory} + plan_generation: + weight: 0.10 + bindings: + current_plan: "active_at(t, style=plan)" + messages: + - {role: user, content: "${task}", stream: high_level} + - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan} + ask_vqa_top: weight: 0.075 bindings: diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml index 8579d9622..6da9e362b 100644 --- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml +++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml @@ -1,15 +1,18 @@ -# SmolVLA2 Hi-Robot blend — two flavors: +# SmolVLA2 Hi-Robot blend — three flavors: # # 1. action_execution — fused (task + plan + memory) prompt; # supervises the current subtask (low_level: flow + text CE) # and, at memory-boundary frames, the new memory too. -# 2. ask_vqa_{top,wrist} — text-only VQA on a camera image, +# 2. plan_generation — task → plan (text only). Trains the +# model to produce a plan from a bare task description so +# the runtime can call it at episode start / replanning. +# 3. ask_vqa_{top,wrist} — text-only VQA on a camera image, # gated by ``if_present`` so they only fire on annotated frames. blend: action_execution: - weight: 0.85 + weight: 0.75 bindings: new_memory: "emitted_at(t, style=memory)" messages: @@ -21,6 +24,14 @@ blend: # new memory as a second assistant turn (same forward pass). - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory} + plan_generation: + weight: 0.10 + bindings: + current_plan: "active_at(t, style=plan)" + messages: + - {role: user, content: "${task}", stream: high_level} + - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan} + ask_vqa_top: weight: 0.075 bindings: diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index e49ef3355..878338e17 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -780,6 +780,16 @@ def _msgs_for_interjection(state: dict[str, Any]) -> list[dict[str, Any]]: return msgs +def _msgs_for_plan(state: dict[str, Any]) -> list[dict[str, Any]]: + """``plan_generation`` recipe layout — bare task → plan. + + The assistant turn is the generation target, so we only render + the user turn at inference; the runtime appends the predicted + plan after sampling. + """ + return [{"role": "user", "content": state.get("task") or ""}] + + def _msgs_for_vqa(question: str) -> list[dict[str, Any]]: """``ask_vqa_*`` recipe layout (text-only at inference).""" return [{"role": "user", "content": question}]