mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 03:59:42 +00:00
recipe+runtime: VQA + subtask only — drop plan & memory
Scope reduction while the core subtask + action loop is validated:
Recipe (hirobot.yaml)
* Removed ``plan_generation`` sub-recipe entirely.
* Removed the memory tail from ``high_level_subtask`` (the
``new_memory`` binding + the second assistant turn).
* ``high_level_subtask`` user turn is now just ``${task}`` — no
``Plan: …\nMemory: …`` context.
* Weights rebalanced over the four remaining sub-recipes:
high_level_subtask 0.40, low_level_execution 0.40,
ask_vqa_top/wrist 0.10 each.
Runtime (inference/runtime.py)
* Pipeline trimmed to VQA + the action loop:
AskVQAFwd → LowLevelForward → DispatchAction → DispatchToolCalls.
* Dropped HighLevelSubtaskFwd / MemoryUpdateFwd / UserInterjectionFwd
from the default pipeline. They remain importable from
``inference.steps`` for when plan/memory/subtask generation is
brought back. Because no subtask is generated at runtime, the
action expert conditions on the task string directly via
LowLevelForward's ``current_subtask or task`` fallback.
This commit lands on top of a rollback of the previous two commits
(the repetition_penalty / no_repeat_ngram_size knobs, and the
deterministic plan-walker) — both were band-aids for the LM-head
repetition collapse that the reduced-scope recipe sidesteps.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,13 @@
|
|||||||
# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
|
# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
|
||||||
# PI052 (PaliGemma backbone). π0.5-style split:
|
# PI052 (PaliGemma backbone).
|
||||||
#
|
#
|
||||||
# The action expert is conditioned on (images, state, subtask) only.
|
# Trains two things only: subtasks and VQA. Plan and memory are
|
||||||
# Hierarchical context (task + plan + memory) only flows into the
|
# intentionally left out for now — keeps the prompt short and the
|
||||||
# high-level text head.
|
# training surface small while the core subtask + action loop is
|
||||||
|
# validated.
|
||||||
#
|
#
|
||||||
# high_level_subtask — predict subtask from (task+plan+memory),
|
# high_level_subtask — predict the subtask from the task.
|
||||||
# and the new memory at boundary frames.
|
|
||||||
# low_level_execution — flow loss with [images, subtask, state].
|
# low_level_execution — flow loss with [images, subtask, state].
|
||||||
# plan_generation — task → plan.
|
|
||||||
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
||||||
#
|
#
|
||||||
# Each backbone's text tokenizer renders these messages differently
|
# Each backbone's text tokenizer renders these messages differently
|
||||||
@@ -18,20 +17,13 @@
|
|||||||
blend:
|
blend:
|
||||||
|
|
||||||
high_level_subtask:
|
high_level_subtask:
|
||||||
weight: 0.50
|
weight: 0.40
|
||||||
bindings:
|
|
||||||
new_memory: "emitted_at(t, style=memory)"
|
|
||||||
messages:
|
messages:
|
||||||
- role: user
|
- {role: user, content: "${task}", stream: high_level}
|
||||||
stream: high_level
|
|
||||||
content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
|
|
||||||
- {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
|
- {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
|
||||||
# Boundary-frame tail: at a subtask transition, also predict
|
|
||||||
# the new memory in the same forward pass.
|
|
||||||
- {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
|
|
||||||
|
|
||||||
low_level_execution:
|
low_level_execution:
|
||||||
weight: 0.30
|
weight: 0.40
|
||||||
messages:
|
messages:
|
||||||
# π0.5-style action conditioning. The action expert sees only
|
# π0.5-style action conditioning. The action expert sees only
|
||||||
# [images, this user turn (= bare subtask), state]. No text-CE
|
# [images, this user turn (= bare subtask), state]. No text-CE
|
||||||
@@ -40,16 +32,8 @@ blend:
|
|||||||
# flow loss fires.
|
# flow loss fires.
|
||||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||||
|
|
||||||
plan_generation:
|
|
||||||
weight: 0.10
|
|
||||||
bindings:
|
|
||||||
current_plan: "active_at(t, style=plan)"
|
|
||||||
messages:
|
|
||||||
- {role: user, content: "${task}", stream: high_level}
|
|
||||||
- {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
|
|
||||||
|
|
||||||
ask_vqa_top:
|
ask_vqa_top:
|
||||||
weight: 0.05
|
weight: 0.10
|
||||||
bindings:
|
bindings:
|
||||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
|
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
|
||||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
|
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
|
||||||
@@ -63,7 +47,7 @@ blend:
|
|||||||
- {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
|
- {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
|
||||||
|
|
||||||
ask_vqa_wrist:
|
ask_vqa_wrist:
|
||||||
weight: 0.05
|
weight: 0.10
|
||||||
bindings:
|
bindings:
|
||||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
||||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
||||||
|
|||||||
@@ -30,11 +30,8 @@ from .steps import (
|
|||||||
AskVQAFwd,
|
AskVQAFwd,
|
||||||
DispatchAction,
|
DispatchAction,
|
||||||
DispatchToolCalls,
|
DispatchToolCalls,
|
||||||
HighLevelSubtaskFwd,
|
|
||||||
InferenceStep,
|
InferenceStep,
|
||||||
LowLevelForward,
|
LowLevelForward,
|
||||||
MemoryUpdateFwd,
|
|
||||||
UserInterjectionFwd,
|
|
||||||
)
|
)
|
||||||
from .triggers import HzTrigger, TickClock
|
from .triggers import HzTrigger, TickClock
|
||||||
|
|
||||||
@@ -69,31 +66,24 @@ class SmolVLA2Runtime:
|
|||||||
_stop: bool = field(default=False, init=False)
|
_stop: bool = field(default=False, init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
# Pipeline order matters. Both ``HighLevelSubtaskFwd`` and
|
# VQA-only configuration (current scope). The training recipe
|
||||||
# ``LowLevelForward`` are gated on "action queue is empty" so
|
# supervises only subtasks + VQA — plan and memory are out for
|
||||||
# the slow LLM call (select_message) doesn't starve dispatch.
|
# now — so the runtime drops the high-level subtask /
|
||||||
# If LowLevelForward runs first, it refills the queue and the
|
# memory-update / interjection steps. The remaining loop is:
|
||||||
# high-level step never sees ``queue == 0`` afterwards.
|
|
||||||
#
|
#
|
||||||
# Order is therefore: high-level steps that read state (subtask,
|
# AskVQAFwd → answer camera-grounded questions on stdin
|
||||||
# memory, interjection, vqa) → low-level chunk refresh → action
|
# LowLevelForward → action chunk (conditioned on the task
|
||||||
# dispatch → tool dispatch. So on an empty-queue tick the
|
# string directly, since no subtask is
|
||||||
# subtask refreshes first, the new subtask string flows into
|
# being generated — see LowLevelForward's
|
||||||
# the next chunk's prompt, and DispatchAction drains.
|
# ``current_subtask or task`` fallback)
|
||||||
|
# DispatchAction → drain the chunk to the robot
|
||||||
|
# DispatchToolCalls → fire any pending tool calls
|
||||||
|
#
|
||||||
|
# ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` /
|
||||||
|
# ``UserInterjectionFwd`` are still importable from
|
||||||
|
# ``inference.steps`` — re-add them here once plan / memory /
|
||||||
|
# subtask generation is back in scope.
|
||||||
self.pipeline = [
|
self.pipeline = [
|
||||||
HighLevelSubtaskFwd(
|
|
||||||
trigger=HzTrigger(self.high_level_hz),
|
|
||||||
policy=self.policy,
|
|
||||||
observation_provider=self.observation_provider,
|
|
||||||
),
|
|
||||||
MemoryUpdateFwd(
|
|
||||||
policy=self.policy,
|
|
||||||
observation_provider=self.observation_provider,
|
|
||||||
),
|
|
||||||
UserInterjectionFwd(
|
|
||||||
policy=self.policy,
|
|
||||||
observation_provider=self.observation_provider,
|
|
||||||
),
|
|
||||||
AskVQAFwd(
|
AskVQAFwd(
|
||||||
policy=self.policy,
|
policy=self.policy,
|
||||||
observation_provider=self.observation_provider,
|
observation_provider=self.observation_provider,
|
||||||
|
|||||||
Reference in New Issue
Block a user