mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
runtime: restore the subtask hierarchy — generated subtask drives actions
Reverts the previous "condition actions on the task" shortcut.
The action expert is conditioned on the SUBTASK again:
* ``low_level_execution`` recipe back to ``user(${subtask})``.
* ``LowLevelForward`` conditions on ``current_subtask`` (falls back
to the task only on the first frame, before the high-level loop
has produced a subtask).
* ``HighLevelSubtaskFwd`` re-added to the runtime pipeline so the
subtask is actually generated each high-level tick and written to
``current_subtask`` before ``LowLevelForward`` consumes it.
* ``_msgs_for_subtask`` now renders just ``${task}`` (no
``Plan: ``/``Memory: `` lines) to match the current
``high_level_subtask`` recipe, whose user turn is the bare task.
So the loop is: task → HighLevelSubtaskFwd (LM head) → subtask →
LowLevelForward → action chunk conditioned on that subtask.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,11 +3,11 @@
|
|||||||
#
|
#
|
||||||
# Trains two things only: subtasks and VQA. Plan and memory are
|
# Trains two things only: subtasks and VQA. Plan and memory are
|
||||||
# intentionally left out for now — keeps the prompt short and the
|
# intentionally left out for now — keeps the prompt short and the
|
||||||
# training surface small while the core action loop is validated.
|
# training surface small while the core subtask + action loop is
|
||||||
|
# validated.
|
||||||
#
|
#
|
||||||
# high_level_subtask — predict the subtask from the task (text
|
# high_level_subtask — predict the subtask from the task.
|
||||||
# head only; not on the inference path yet).
|
# low_level_execution — flow loss with [images, subtask, state].
|
||||||
# low_level_execution — flow loss with [images, task, state].
|
|
||||||
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
||||||
#
|
#
|
||||||
# Each backbone's text tokenizer renders these messages differently
|
# Each backbone's text tokenizer renders these messages differently
|
||||||
@@ -25,15 +25,14 @@ blend:
|
|||||||
low_level_execution:
|
low_level_execution:
|
||||||
weight: 0.40
|
weight: 0.40
|
||||||
messages:
|
messages:
|
||||||
# The action expert is conditioned on the TASK (not the subtask).
|
# The action expert is conditioned on the SUBTASK — at inference
|
||||||
# The task is always available at inference with no high-level
|
# the high-level loop (``HighLevelSubtaskFwd``) generates the
|
||||||
# generation loop, so this removes the train/inference mismatch
|
# subtask via the LM head and feeds it here. The action expert's
|
||||||
# that a subtask-conditioned action head would have while there
|
# prefix is [images, subtask, state]. ``stream: low_level`` flips
|
||||||
# is no reliable runtime subtask source. ``high_level_subtask``
|
# ``predict_actions=True`` so the flow loss fires; no text-CE
|
||||||
# still trains the text head to predict subtasks for later use.
|
# target here (subtask prediction is owned by
|
||||||
# ``stream: low_level`` flips ``predict_actions=True`` so the
|
# ``high_level_subtask``).
|
||||||
# flow loss fires; no text-CE target here.
|
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||||
- {role: user, content: "${task}", stream: low_level}
|
|
||||||
|
|
||||||
ask_vqa_top:
|
ask_vqa_top:
|
||||||
weight: 0.10
|
weight: 0.10
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from .steps import (
|
|||||||
AskVQAFwd,
|
AskVQAFwd,
|
||||||
DispatchAction,
|
DispatchAction,
|
||||||
DispatchToolCalls,
|
DispatchToolCalls,
|
||||||
|
HighLevelSubtaskFwd,
|
||||||
InferenceStep,
|
InferenceStep,
|
||||||
LowLevelForward,
|
LowLevelForward,
|
||||||
)
|
)
|
||||||
@@ -66,24 +67,29 @@ class SmolVLA2Runtime:
|
|||||||
_stop: bool = field(default=False, init=False)
|
_stop: bool = field(default=False, init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
# VQA-only configuration (current scope). The training recipe
|
# Subtask + VQA configuration (current scope — plan and memory
|
||||||
# supervises only subtasks + VQA — plan and memory are out for
|
# are not trained yet). Pipeline:
|
||||||
# now — so the runtime drops the high-level subtask /
|
|
||||||
# memory-update / interjection steps. The remaining loop is:
|
|
||||||
#
|
#
|
||||||
# AskVQAFwd → answer camera-grounded questions on stdin
|
# HighLevelSubtaskFwd → generate the next subtask via the LM
|
||||||
# LowLevelForward → action chunk (conditioned on the task
|
# head at ~``high_level_hz``; writes
|
||||||
# string directly, since no subtask is
|
# ``current_subtask``
|
||||||
# being generated — see LowLevelForward's
|
# AskVQAFwd → answer camera-grounded stdin questions
|
||||||
# ``current_subtask or task`` fallback)
|
# LowLevelForward → action chunk conditioned on the
|
||||||
# DispatchAction → drain the chunk to the robot
|
# generated ``current_subtask``
|
||||||
# DispatchToolCalls → fire any pending tool calls
|
# DispatchAction → drain the chunk to the robot
|
||||||
|
# DispatchToolCalls → fire any pending tool calls
|
||||||
#
|
#
|
||||||
# ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` /
|
# Order matters: ``HighLevelSubtaskFwd`` and ``LowLevelForward``
|
||||||
# ``UserInterjectionFwd`` are still importable from
|
# are both gated on "action queue empty", so the subtask must
|
||||||
# ``inference.steps`` — re-add them here once plan / memory /
|
# refresh *before* the chunk that consumes it. ``MemoryUpdateFwd``
|
||||||
# subtask generation is back in scope.
|
# / ``UserInterjectionFwd`` are still importable from
|
||||||
|
# ``inference.steps`` — re-add once plan / memory are in scope.
|
||||||
self.pipeline = [
|
self.pipeline = [
|
||||||
|
HighLevelSubtaskFwd(
|
||||||
|
trigger=HzTrigger(self.high_level_hz),
|
||||||
|
policy=self.policy,
|
||||||
|
observation_provider=self.observation_provider,
|
||||||
|
),
|
||||||
AskVQAFwd(
|
AskVQAFwd(
|
||||||
policy=self.policy,
|
policy=self.policy,
|
||||||
observation_provider=self.observation_provider,
|
observation_provider=self.observation_provider,
|
||||||
|
|||||||
@@ -111,12 +111,14 @@ class LowLevelForward(InferenceStep):
|
|||||||
if observation is None:
|
if observation is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# The action expert is conditioned on the TASK string — the
|
# The action expert is conditioned on the SUBTASK generated by
|
||||||
# ``low_level_execution`` recipe renders ``user(${task})``.
|
# the high-level loop (``HighLevelSubtaskFwd`` runs earlier in
|
||||||
# The task is stable for the whole episode and always present,
|
# the pipeline and writes ``current_subtask``). Matches the
|
||||||
# so there is no train/inference mismatch and no dependency on
|
# training-time ``low_level_execution`` recipe — ``user(${subtask})``.
|
||||||
# a (currently unreliable) high-level subtask generator.
|
# Falls back to the task string whenever ``current_subtask`` is unset or
|
||||||
ctx = [{"role": "user", "content": state.get("task") or ""}]
|
# empty — normally only before the high-level loop has produced one.
|
||||||
|
subtask = state.get("current_subtask") or state.get("task") or ""
|
||||||
|
ctx = [{"role": "user", "content": subtask}]
|
||||||
# ``add_generation_prompt=False`` to match the training-time
|
# ``add_generation_prompt=False`` to match the training-time
|
||||||
# prefix shape: at training the action expert sees the rendered
|
# prefix shape: at training the action expert sees the rendered
|
||||||
# user turn ending at ``<|im_end|>`` (no trailing
|
# user turn ending at ``<|im_end|>`` (no trailing
|
||||||
@@ -744,11 +746,12 @@ def _hirobot_user_head(state: dict[str, Any]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]:
|
def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
"""``high_level_subtask`` recipe layout — predict the current subtask
|
"""``high_level_subtask`` recipe layout — predict the subtask from the
|
||||||
from (task + plan + memory). Even when plan / memory aren't set yet
|
task. The current recipe's user turn is just ``${task}`` (plan and
|
||||||
the labels render as bare ``Plan: `` / ``Memory: `` to match training.
|
memory are not trained), so the inference prompt is the bare task —
|
||||||
|
no ``Plan: `` / ``Memory: `` lines.
|
||||||
"""
|
"""
|
||||||
return [{"role": "user", "content": _hirobot_user_head(state)}]
|
return [{"role": "user", "content": state.get("task") or ""}]
|
||||||
|
|
||||||
|
|
||||||
def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]:
|
def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
|
|||||||
Reference in New Issue
Block a user