feat(recipes): add plan_generation sub-recipe to smolvla2 + pi052 blends

New text-only sub-recipe at weight 0.10 in both blends:

    user      :  ${task}
    assistant :  ${current_plan}   (high_level target)

Bound to ``active_at(t, style=plan)`` so it supervises the
currently-active plan on every frame, gated by ``if_present`` to
skip frames without a plan annotation.

Weights rebalanced: action_execution 0.85 → 0.75, plan_generation
0.10, VQA top/wrist 0.075 each (sums to 1.0).
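
For reference, the rebalanced weights still sum to one:

    0.75 + 0.10 + 0.075 + 0.075 = 1.00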

Added matching runtime builder ``_msgs_for_plan`` in
``smolvla2/inference/steps.py`` so the high-level loop can call
``select_message`` with the bare-task prompt at episode start /
replanning events.
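
A rough call-site sketch (illustrative only; the actual
``select_message`` signature, argument names, and state keys may
differ from the real runtime code):

    # hypothetical wiring at episode start / on a replanning event
    msgs = _msgs_for_plan({"task": task})                      # single bare-task user turn
    plan = select_message(policy, msgs, stream="high_level")   # assumed call shape
    state["current_plan"] = plan                               # consumed by the fused action_execution prompt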

Closes a gap vs. Pi 0.7 §V — without this recipe the model could
read ``${plan}`` from the prompt but never had to produce one.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Pepijn
Date:   2026-05-13 13:51:37 +02:00
parent 9ff62cb08c
commit e3ad1c59fc

3 changed files with 33 additions and 4 deletions
@@ -9,7 +9,7 @@
 blend:
   action_execution:
-    weight: 0.85
+    weight: 0.75
     bindings:
       new_memory: "emitted_at(t, style=memory)"
     messages:
@@ -21,6 +21,14 @@ blend:
       # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
+  plan_generation:
+    weight: 0.10
+    bindings:
+      current_plan: "active_at(t, style=plan)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
   ask_vqa_top:
     weight: 0.075
     bindings:
@@ -1,15 +1,18 @@
-# SmolVLA2 Hi-Robot blend — two flavors:
+# SmolVLA2 Hi-Robot blend — three flavors:
 #
 # 1. action_execution — fused (task + plan + memory) prompt;
 #    supervises the current subtask (low_level: flow + text CE)
 #    and, at memory-boundary frames, the new memory too.
-# 2. ask_vqa_{top,wrist} — text-only VQA on a camera image,
+# 2. plan_generation — task → plan (text only). Trains the
+#    model to produce a plan from a bare task description so
+#    the runtime can call it at episode start / replanning.
+# 3. ask_vqa_{top,wrist} — text-only VQA on a camera image,
 #    gated by ``if_present`` so they only fire on annotated frames.
 blend:
   action_execution:
-    weight: 0.85
+    weight: 0.75
     bindings:
       new_memory: "emitted_at(t, style=memory)"
     messages:
@@ -21,6 +24,14 @@ blend:
       # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
+  plan_generation:
+    weight: 0.10
+    bindings:
+      current_plan: "active_at(t, style=plan)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
   ask_vqa_top:
     weight: 0.075
     bindings:
@@ -780,6 +780,16 @@ def _msgs_for_interjection(state: dict[str, Any]) -> list[dict[str, Any]]:
     return msgs


+def _msgs_for_plan(state: dict[str, Any]) -> list[dict[str, Any]]:
+    """``plan_generation`` recipe layout — bare task → plan.
+
+    The assistant turn is the generation target, so we only render
+    the user turn at inference; the runtime appends the predicted
+    plan after sampling.
+    """
+    return [{"role": "user", "content": state.get("task") or ""}]
+
+
 def _msgs_for_vqa(question: str) -> list[dict[str, Any]]:
     """``ask_vqa_*`` recipe layout (text-only at inference)."""
     return [{"role": "user", "content": question}]