mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 08:39:49 +00:00
recipe+runtime: VQA + subtask only — drop plan & memory
Scope reduction while the core subtask + action loop is validated:
Recipe (hirobot.yaml)
* Removed ``plan_generation`` sub-recipe entirely.
* Removed the memory tail from ``high_level_subtask`` (the
``new_memory`` binding + the second assistant turn).
* ``high_level_subtask`` user turn is now just ``${task}`` — no
``Plan: …\nMemory: …`` context.
* Weights rebalanced over the four remaining sub-recipes:
high_level_subtask 0.40, low_level_execution 0.40,
ask_vqa_top/wrist 0.10 each.
Runtime (inference/runtime.py)
* Pipeline trimmed to VQA + the action loop:
AskVQAFwd → LowLevelForward → DispatchAction → DispatchToolCalls.
* Dropped HighLevelSubtaskFwd / MemoryUpdateFwd / UserInterjectionFwd
from the default pipeline. They remain importable from
``inference.steps`` for when plan/memory/subtask generation is
brought back. The action expert conditions on the task string
directly via LowLevelForward's ``current_subtask or task``
fallback.
This commit lands on top of a rollback of the previous two commits
(repetition_penalty / no_repeat_ngram_size knobs, and the
deterministic plan-walker) — both were band-aids for the LM-head
repetition collapse that the reduced-scope recipe sidesteps.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,13 @@
|
||||
# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
|
||||
# PI052 (PaliGemma backbone). π0.5-style split:
|
||||
# PI052 (PaliGemma backbone).
|
||||
#
|
||||
# The action expert is conditioned on (images, state, subtask) only.
|
||||
# Hierarchical context (task + plan + memory) only flows into the
|
||||
# high-level text head.
|
||||
# Trains two things only: subtasks and VQA. Plan and memory are
|
||||
# intentionally left out for now — keeps the prompt short and the
|
||||
# training surface small while the core subtask + action loop is
|
||||
# validated.
|
||||
#
|
||||
# high_level_subtask — predict subtask from (task+plan+memory),
|
||||
# and the new memory at boundary frames.
|
||||
# high_level_subtask — predict the subtask from the task.
|
||||
# low_level_execution — flow loss with [images, subtask, state].
|
||||
# plan_generation — task → plan.
|
||||
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
||||
#
|
||||
# Each backbone's text tokenizer renders these messages differently
|
||||
@@ -18,20 +17,13 @@
|
||||
blend:
|
||||
|
||||
high_level_subtask:
|
||||
weight: 0.50
|
||||
bindings:
|
||||
new_memory: "emitted_at(t, style=memory)"
|
||||
weight: 0.40
|
||||
messages:
|
||||
- role: user
|
||||
stream: high_level
|
||||
content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
|
||||
- {role: user, content: "${task}", stream: high_level}
|
||||
- {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
|
||||
# Boundary-frame tail: at a subtask transition, also predict
|
||||
# the new memory in the same forward pass.
|
||||
- {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
|
||||
|
||||
low_level_execution:
|
||||
weight: 0.30
|
||||
weight: 0.40
|
||||
messages:
|
||||
# π0.5-style action conditioning. The action expert sees only
|
||||
# [images, this user turn (= bare subtask), state]. No text-CE
|
||||
@@ -40,16 +32,8 @@ blend:
|
||||
# flow loss fires.
|
||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
bindings:
|
||||
current_plan: "active_at(t, style=plan)"
|
||||
messages:
|
||||
- {role: user, content: "${task}", stream: high_level}
|
||||
- {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
|
||||
|
||||
ask_vqa_top:
|
||||
weight: 0.05
|
||||
weight: 0.10
|
||||
bindings:
|
||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
|
||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
|
||||
@@ -63,7 +47,7 @@ blend:
|
||||
- {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
|
||||
|
||||
ask_vqa_wrist:
|
||||
weight: 0.05
|
||||
weight: 0.10
|
||||
bindings:
|
||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
||||
|
||||
@@ -30,11 +30,8 @@ from .steps import (
|
||||
AskVQAFwd,
|
||||
DispatchAction,
|
||||
DispatchToolCalls,
|
||||
HighLevelSubtaskFwd,
|
||||
InferenceStep,
|
||||
LowLevelForward,
|
||||
MemoryUpdateFwd,
|
||||
UserInterjectionFwd,
|
||||
)
|
||||
from .triggers import HzTrigger, TickClock
|
||||
|
||||
@@ -69,31 +66,24 @@ class SmolVLA2Runtime:
|
||||
_stop: bool = field(default=False, init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
# Pipeline order matters. Both ``HighLevelSubtaskFwd`` and
|
||||
# ``LowLevelForward`` are gated on "action queue is empty" so
|
||||
# the slow LLM call (select_message) doesn't starve dispatch.
|
||||
# If LowLevelForward runs first, it refills the queue and the
|
||||
# high-level step never sees ``queue == 0`` afterwards.
|
||||
# VQA-only configuration (current scope). The training recipe
|
||||
# supervises only subtasks + VQA — plan and memory are out for
|
||||
# now — so the runtime drops the high-level subtask /
|
||||
# memory-update / interjection steps. The remaining loop is:
|
||||
#
|
||||
# Order is therefore: high-level steps that read state (subtask,
|
||||
# memory, interjection, vqa) → low-level chunk refresh → action
|
||||
# dispatch → tool dispatch. So on an empty-queue tick the
|
||||
# subtask refreshes first, the new subtask string flows into
|
||||
# the next chunk's prompt, and DispatchAction drains.
|
||||
# AskVQAFwd → answer camera-grounded questions on stdin
|
||||
# LowLevelForward → action chunk (conditioned on the task
|
||||
# string directly, since no subtask is
|
||||
# being generated — see LowLevelForward's
|
||||
# ``current_subtask or task`` fallback)
|
||||
# DispatchAction → drain the chunk to the robot
|
||||
# DispatchToolCalls → fire any pending tool calls
|
||||
#
|
||||
# ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` /
|
||||
# ``UserInterjectionFwd`` are still importable from
|
||||
# ``inference.steps`` — re-add them here once plan / memory /
|
||||
# subtask generation is back in scope.
|
||||
self.pipeline = [
|
||||
HighLevelSubtaskFwd(
|
||||
trigger=HzTrigger(self.high_level_hz),
|
||||
policy=self.policy,
|
||||
observation_provider=self.observation_provider,
|
||||
),
|
||||
MemoryUpdateFwd(
|
||||
policy=self.policy,
|
||||
observation_provider=self.observation_provider,
|
||||
),
|
||||
UserInterjectionFwd(
|
||||
policy=self.policy,
|
||||
observation_provider=self.observation_provider,
|
||||
),
|
||||
AskVQAFwd(
|
||||
policy=self.policy,
|
||||
observation_provider=self.observation_provider,
|
||||
|
||||
Reference in New Issue
Block a user