mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-16 17:20:05 +00:00
feat(recipes): add plan_generation sub-recipe to smolvla2 + pi052 blends
New text-only sub-recipe at 0.10 weight on both blends:
user : ${task}
assistant : ${current_plan} (high_level target)
Bound to ``active_at(t, style=plan)`` so it supervises the
currently-active plan on every frame; the assistant (target) turn is
gated by ``if_present: current_plan`` to skip frames without a plan
annotation.
Weights rebalanced: action_execution 0.85 → 0.75, plan_generation
0.10, VQA top/wrist 0.075 each (sums to 1.0).
Added matching runtime builder ``_msgs_for_plan`` in
``smolvla2/inference/steps.py`` so the high-level loop can call
``select_message`` with the bare-task prompt at episode start /
replanning events.
Closes a gap vs. Pi 0.7 §V — without this recipe the model could
read ``${plan}`` from the prompt but never had to produce one.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
blend:
|
||||
|
||||
action_execution:
|
||||
weight: 0.85
|
||||
weight: 0.75
|
||||
bindings:
|
||||
new_memory: "emitted_at(t, style=memory)"
|
||||
messages:
|
||||
@@ -21,6 +21,14 @@ blend:
|
||||
# new memory as a second assistant turn (same forward pass).
|
||||
- {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
bindings:
|
||||
current_plan: "active_at(t, style=plan)"
|
||||
messages:
|
||||
- {role: user, content: "${task}", stream: high_level}
|
||||
- {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
|
||||
|
||||
ask_vqa_top:
|
||||
weight: 0.075
|
||||
bindings:
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
# SmolVLA2 Hi-Robot blend — two flavors:
|
||||
# SmolVLA2 Hi-Robot blend — three flavors:
|
||||
#
|
||||
# 1. action_execution — fused (task + plan + memory) prompt;
|
||||
# supervises the current subtask (low_level: flow + text CE)
|
||||
# and, at memory-boundary frames, the new memory too.
|
||||
# 2. ask_vqa_{top,wrist} — text-only VQA on a camera image,
|
||||
# 2. plan_generation — task → plan (text only). Trains the
|
||||
# model to produce a plan from a bare task description so
|
||||
# the runtime can call it at episode start / replanning.
|
||||
# 3. ask_vqa_{top,wrist} — text-only VQA on a camera image,
|
||||
# gated by ``if_present`` so they only fire on annotated frames.
|
||||
|
||||
blend:
|
||||
|
||||
action_execution:
|
||||
weight: 0.85
|
||||
weight: 0.75
|
||||
bindings:
|
||||
new_memory: "emitted_at(t, style=memory)"
|
||||
messages:
|
||||
@@ -21,6 +24,14 @@ blend:
|
||||
# new memory as a second assistant turn (same forward pass).
|
||||
- {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
bindings:
|
||||
current_plan: "active_at(t, style=plan)"
|
||||
messages:
|
||||
- {role: user, content: "${task}", stream: high_level}
|
||||
- {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
|
||||
|
||||
ask_vqa_top:
|
||||
weight: 0.075
|
||||
bindings:
|
||||
|
||||
@@ -780,6 +780,16 @@ def _msgs_for_interjection(state: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
return msgs
|
||||
|
||||
|
||||
def _msgs_for_plan(state: dict[str, Any]) -> list[dict[str, Any]]:
    """``plan_generation`` recipe layout — bare task → plan.

    The assistant turn is the generation target, so we only render
    the user turn at inference; the runtime appends the predicted
    plan after sampling.

    Args:
        state: Runtime state mapping; only the ``"task"`` entry is read.

    Returns:
        A single-element list holding the user turn. A missing or falsy
        ``"task"`` is rendered as an empty string.
    """
    # ``or ""`` covers an explicit ``None`` task as well as a missing key.
    return [{"role": "user", "content": state.get("task") or ""}]
|
||||
|
||||
|
||||
def _msgs_for_vqa(question: str) -> list[dict[str, Any]]:
    """``ask_vqa_*`` recipe layout (text-only at inference).

    Args:
        question: Question text, used verbatim as the user turn's content.

    Returns:
        A single-element list holding the user turn.
    """
    return [{"role": "user", "content": question}]
|
||||
|
||||
Reference in New Issue
Block a user