mirror of
https://github.com/huggingface/lerobot.git
synced 2026-07-01 07:07:08 +00:00
refactor(recipes): consolidate to shared hirobot.yaml + audit fixes
The smolvla2 and pi052 recipe blends had drifted to identical content twice in a row; collapse them to a single ``recipes/hirobot.yaml`` both policies point at. Each backbone's text tokenizer (chat-template for SmolVLA2, plain ``Role: content`` for PI052) handles the rendering differences downstream — the recipe spec is shared.

Audit fixes folded into the same commit:

* **Train/inference prefix mismatch on the action expert**
  ``_build_text_batch`` always passed ``add_generation_prompt=True``, appending ``<|im_start|>assistant\n`` tokens that the action expert never saw at training (the chat tokenizer renders with ``add_generation_prompt=False``). Parameterized the helper and pass ``False`` from ``LowLevelForward``; ``select_message`` paths still default to ``True`` for AR text generation.

* **PI052 fallthrough could silently train flow on text-only frames**
  When ``text_loss_weight=0`` AND every sample was high-level (``predict_actions.any()==False``), the previous heuristic delegated to ``PI05Policy.forward``, which ignores ``predict_actions`` and runs flow on every sample. Reverted to delegating only on fully unannotated batches.

* **SmolVLA2 silent zero-loss training**
  ``forward`` returned ``loss=0`` (no error) when neither flow nor text path fired. Now raises ``RuntimeError`` with the weights and routing flags — fails loud like PI052 already does.

* **PI052 dropout-seed key**
  Was reading ``complementary["dataset_index"]`` (only set by ``MultiDataset`` and means "which sub-dataset", not row index) with fallback to ``frame_index`` (never set) — every sample got seed=0, so per-component dropout was deterministic across the epoch. Switched to ``complementary["index"]`` to match SmolVLA2 and the canonical ``BatchProcessor`` convention.

* **Dead ``DEFAULT_TOOLS`` import**
  Removed from ``chat_processor_smolvla2.py`` — unused since the default-tools list was switched to ``[]`` in the prior commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+19
-20
@@ -1,18 +1,19 @@
|
||||
# SmolVLA2 Hi-Robot blend — π0.5-style split:
|
||||
# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
|
||||
# PI052 (PaliGemma backbone). π0.5-style split:
|
||||
#
|
||||
# The action expert is conditioned on (images, state, subtask)
|
||||
# only — NOT on task / plan / memory. We achieve this by splitting
|
||||
# the work across two main sub-recipes:
|
||||
# The action expert is conditioned on (images, state, subtask) only.
|
||||
# Hierarchical context (task + plan + memory) only flows into the
|
||||
# high-level text head.
|
||||
#
|
||||
# 1. high_level_subtask — text-only. Trains the LM head to predict
|
||||
# the current subtask from (task + plan + memory). At a memory
|
||||
# boundary, also predicts the new memory in the same forward.
|
||||
# 2. low_level_execution — action. Renders just the subtask as the
|
||||
# language conditioning so the action expert's prefix is
|
||||
# [images, subtask, state]. Flow loss + (redundant) text CE on
|
||||
# the subtask itself.
|
||||
# 3. plan_generation — text only. task → plan.
|
||||
# 4. ask_vqa_{top,wrist} — text only. camera-grounded VQA.
|
||||
# high_level_subtask — predict subtask from (task+plan+memory),
|
||||
# and the new memory at boundary frames.
|
||||
# low_level_execution — flow loss with [images, subtask, state].
|
||||
# plan_generation — task → plan.
|
||||
# ask_vqa_{top,wrist} — camera-grounded VQA.
|
||||
#
|
||||
# Each backbone's text tokenizer renders these messages differently
|
||||
# (SmolVLA2 uses the chat template; PI052 concatenates as plain
|
||||
# ``Role: content`` text), but the recipe spec is identical.
|
||||
|
||||
blend:
|
||||
|
||||
@@ -32,13 +33,11 @@ blend:
|
||||
low_level_execution:
|
||||
weight: 0.30
|
||||
messages:
|
||||
# π0.5-style action conditioning: the action expert sees just
|
||||
# the subtask (plus images + state). No text-CE target here —
|
||||
# ``high_level_subtask`` (w=0.50) already trains subtask
|
||||
# prediction from real context; supervising it again as a
|
||||
# copy-from-user turn would dilute the LM head. ``stream:
|
||||
# low_level`` on either turn is enough to flip
|
||||
# ``predict_actions=True`` so the flow loss fires.
|
||||
# π0.5-style action conditioning. The action expert sees only
|
||||
# [images, this user turn (= bare subtask), state]. No text-CE
|
||||
# target — subtask prediction is owned by ``high_level_subtask``.
|
||||
# ``stream: low_level`` flips ``predict_actions=True`` so the
|
||||
# flow loss fires.
|
||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||
|
||||
plan_generation:
|
||||
@@ -1,65 +0,0 @@
|
||||
# π0.5 v2 (pi052) Hi-Robot blend.
|
||||
#
|
||||
# Same shape as ``smolvla2_hirobot.yaml`` — see that file for the
|
||||
# flavor breakdown. The only difference here is the backbone:
|
||||
# PaliGemma isn't chat-pretrained, so ``PI052TextTokenizerStep``
|
||||
# concatenates messages as ``Role: content`` plain text instead
|
||||
# of calling ``apply_chat_template``.
|
||||
|
||||
blend:
|
||||
|
||||
high_level_subtask:
|
||||
weight: 0.50
|
||||
bindings:
|
||||
new_memory: "emitted_at(t, style=memory)"
|
||||
messages:
|
||||
- role: user
|
||||
stream: high_level
|
||||
content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
|
||||
- {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
|
||||
- {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
|
||||
|
||||
low_level_execution:
|
||||
weight: 0.30
|
||||
messages:
|
||||
# Action expert prefix = [images, subtask, state] only — π0.5 style.
|
||||
# No text-CE target: ``high_level_subtask`` already supervises
|
||||
# subtask prediction from real context. ``stream: low_level``
|
||||
# flips ``predict_actions=True`` so the flow loss fires.
|
||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
bindings:
|
||||
current_plan: "active_at(t, style=plan)"
|
||||
messages:
|
||||
- {role: user, content: "${task}", stream: high_level}
|
||||
- {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
|
||||
|
||||
ask_vqa_top:
|
||||
weight: 0.05
|
||||
bindings:
|
||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
|
||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
|
||||
messages:
|
||||
- role: user
|
||||
stream: high_level
|
||||
if_present: vqa_query
|
||||
content:
|
||||
- {type: image, feature: observation.images.front}
|
||||
- {type: text, text: "${vqa_query}"}
|
||||
- {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
|
||||
|
||||
ask_vqa_wrist:
|
||||
weight: 0.05
|
||||
bindings:
|
||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
||||
messages:
|
||||
- role: user
|
||||
stream: high_level
|
||||
if_present: vqa_query
|
||||
content:
|
||||
- {type: image, feature: observation.images.wrist}
|
||||
- {type: text, text: "${vqa_query}"}
|
||||
- {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
|
||||
@@ -24,7 +24,7 @@ Extends :class:`lerobot.policies.pi05.PI05Policy` with:
|
||||
* per-component prompt dropout (Pi 0.7 §V.E) for regularising the
|
||||
text head against missing context at inference.
|
||||
|
||||
See ``src/lerobot/configs/recipes/pi052_hirobot.yaml`` for the
|
||||
See ``src/lerobot/configs/recipes/hirobot.yaml`` for the
|
||||
canonical training recipe and
|
||||
``examples/training/pi052_hirobot.slurm`` for the launcher.
|
||||
"""
|
||||
|
||||
@@ -57,7 +57,7 @@ class PI052Config(PI05Config):
|
||||
"""
|
||||
|
||||
# Recipe / language stack ---------------------------------------------
|
||||
recipe_path: str | None = "recipes/pi052_hirobot.yaml"
|
||||
recipe_path: str | None = "recipes/hirobot.yaml"
|
||||
"""Path (absolute or relative to ``src/lerobot/configs/``) to a
|
||||
``TrainingRecipe`` YAML. Defaults to the canonical Hi-Robot blend
|
||||
shipped alongside this policy. Set to ``None`` to disable recipe
|
||||
|
||||
@@ -366,26 +366,17 @@ class PI052Policy(PI05Policy):
|
||||
text_labels = batch.get("text_labels")
|
||||
predict_actions_t = batch.get("predict_actions")
|
||||
|
||||
# Unannotated datasets / batches with nothing to train: fall
|
||||
# through to PI05Policy so the plain flow-only training surface
|
||||
# keeps working. Triggers when:
|
||||
# * the recipe wasn't applied (no text_labels, no
|
||||
# predict_actions), OR
|
||||
# * every sample's recipe is text-only AND text is disabled
|
||||
# (would otherwise hit the "nothing to train" raise below).
|
||||
text_disabled = (
|
||||
self.config.text_loss_weight <= 0 or text_labels is None
|
||||
)
|
||||
fast_disabled = not getattr(self.config, "enable_fast_action_loss", False)
|
||||
no_flow_samples = (
|
||||
predict_actions_t is not None
|
||||
and not bool(predict_actions_t.any().item())
|
||||
)
|
||||
# Fall through to PI05Policy only on fully unannotated batches
|
||||
# (no recipe applied → no routing fields). For recipe-applied
|
||||
# batches we keep control of the loss dispatch even if all
|
||||
# samples are text-only — delegating would silently train flow
|
||||
# on text-only frames (PI05Policy.forward ignores
|
||||
# ``predict_actions``).
|
||||
if (
|
||||
text_labels is None
|
||||
and predict_actions_t is None
|
||||
and fast_disabled
|
||||
) or (text_disabled and no_flow_samples and fast_disabled):
|
||||
and not getattr(self.config, "enable_fast_action_loss", False)
|
||||
):
|
||||
return super().forward(batch, reduction=reduction)
|
||||
|
||||
run_flow = (
|
||||
|
||||
@@ -252,8 +252,14 @@ class PI052TextTokenizerStep(ProcessorStep):
|
||||
|
||||
seed = self.dropout_seed
|
||||
if seed is None:
|
||||
seed_src = complementary.get("dataset_index") or complementary.get("frame_index") or 0
|
||||
# Canonical row-index key set by ``BatchProcessor`` /
|
||||
# ``render_messages_processor``. Falling back to other
|
||||
# keys silently gave every sample seed=0 → identical
|
||||
# dropout pattern across the whole epoch.
|
||||
seed_src = complementary.get("index", 0)
|
||||
try:
|
||||
if hasattr(seed_src, "item"):
|
||||
seed_src = seed_src.item()
|
||||
seed = int(seed_src)
|
||||
except (TypeError, ValueError):
|
||||
seed = 0
|
||||
|
||||
@@ -45,7 +45,6 @@ from typing import Any
|
||||
import torch
|
||||
|
||||
from lerobot.configs import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.datasets.language import DEFAULT_TOOLS
|
||||
from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry
|
||||
from lerobot.types import EnvTransition, TransitionKey
|
||||
from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
|
||||
@@ -283,7 +282,7 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
|
||||
"""Probabilistically drop non-target context messages.
|
||||
|
||||
Heuristic content sniffing — matches the prefix strings that
|
||||
``smolvla2_hirobot.yaml``'s recipes use when injecting plan /
|
||||
``hirobot.yaml``'s recipes use when injecting plan /
|
||||
memory / subtask / interjection content. Anything else is
|
||||
kept unchanged. Target messages are never dropped (we still
|
||||
need their tokens for supervision).
|
||||
|
||||
@@ -56,7 +56,7 @@ class SmolVLA2Config(SmolVLAConfig):
|
||||
"""
|
||||
|
||||
# Recipe / language stack ---------------------------------------------
|
||||
recipe_path: str | None = "recipes/smolvla2_hirobot.yaml"
|
||||
recipe_path: str | None = "recipes/hirobot.yaml"
|
||||
"""Path (absolute or relative to ``src/lerobot/configs/``) to a
|
||||
``TrainingRecipe`` YAML. The default points at the canonical Hi Robot
|
||||
blend shipped alongside SmolVLA2. Set to ``None`` to disable recipe
|
||||
|
||||
@@ -17,7 +17,7 @@ Each step is a tiny class with a ``trigger`` and an ``__call__(state)``;
|
||||
the runtime applies them in order each tick. When a step's trigger
|
||||
doesn't fire, the step is a no-op and the runtime moves on.
|
||||
|
||||
Stream-to-step mapping mirrors the ``smolvla2_hirobot.yaml`` recipe:
|
||||
Stream-to-step mapping mirrors the ``hirobot.yaml`` recipe:
|
||||
|
||||
* ``LowLevelForward`` — calls ``policy.select_action`` for the
|
||||
action chunk; trained by
|
||||
@@ -120,7 +120,13 @@ class LowLevelForward(InferenceStep):
|
||||
# high-level recipe).
|
||||
subtask = state.get("current_subtask") or state.get("task") or ""
|
||||
ctx = [{"role": "user", "content": subtask}]
|
||||
text_batch = _build_text_batch(self.policy, ctx)
|
||||
# ``add_generation_prompt=False`` to match the training-time
|
||||
# prefix shape: at training the action expert sees the rendered
|
||||
# user turn ending at ``<|im_end|>`` (no trailing
|
||||
# ``<|im_start|>assistant\n``). Passing True here would append
|
||||
# extra role-marker tokens the action expert never saw during
|
||||
# training.
|
||||
text_batch = _build_text_batch(self.policy, ctx, add_generation_prompt=False)
|
||||
from lerobot.utils.constants import ( # noqa: PLC0415
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
@@ -232,7 +238,12 @@ class DispatchAction(InferenceStep):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
def _build_text_batch(
|
||||
policy: Any,
|
||||
prompt_messages: list[dict[str, Any]],
|
||||
*,
|
||||
add_generation_prompt: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
"""Tokenize a list of chat messages into the batch shape
|
||||
``select_message`` expects.
|
||||
|
||||
@@ -263,7 +274,7 @@ def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dic
|
||||
text_messages = [_strip_lerobot_blocks(m) for m in prompt_messages]
|
||||
encoded = tokenizer.apply_chat_template(
|
||||
text_messages,
|
||||
add_generation_prompt=True,
|
||||
add_generation_prompt=add_generation_prompt,
|
||||
tokenize=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
@@ -690,7 +701,7 @@ def _control_context_messages(
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build a chat-template-ready prompt from current runtime state.
|
||||
|
||||
Mirrors what ``smolvla2_hirobot.yaml`` renders into ``${task}\nPlan:
|
||||
Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan:
|
||||
${plan}\nMemory: ${memory}`` for the high-level branches.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
@@ -711,7 +722,7 @@ def _control_context_messages(
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-recipe prompt builders. Each one mirrors a single sub-recipe's
|
||||
# message layout in ``smolvla2_hirobot.yaml`` so the chat-templated
|
||||
# message layout in ``hirobot.yaml`` so the chat-templated
|
||||
# prompt at inference matches what the model saw during training.
|
||||
# Generic ``_control_context_messages`` is kept around as a fallback
|
||||
# for ad-hoc callers but the four high-level steps now use these.
|
||||
|
||||
@@ -246,6 +246,21 @@ class SmolVLA2Policy(SmolVLAPolicy):
|
||||
text_loss = self._compute_text_loss(batch, text_labels)
|
||||
total = total + self.config.text_loss_weight * text_loss
|
||||
loss_dict["text_loss"] = float(text_loss.detach().item())
|
||||
else:
|
||||
# No path fired — happens when both loss weights are 0 or
|
||||
# the batch has neither action samples nor supervised text.
|
||||
# Fail loud rather than train silently on a zero loss.
|
||||
raise RuntimeError(
|
||||
"SmolVLA2Policy.forward: nothing to train — "
|
||||
"flow_loss_weight=%s, text_loss_weight=%s, "
|
||||
"predict_actions.any()=%s, has_text_data=%s"
|
||||
% (
|
||||
self.config.flow_loss_weight,
|
||||
self.config.text_loss_weight,
|
||||
bool(predict_actions_t.any().item()) if has_per_sample_routing else None,
|
||||
has_text_data,
|
||||
)
|
||||
)
|
||||
|
||||
loss_dict["loss"] = float(total.detach().item())
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ def _load_recipe(path_str: str) -> TrainingRecipe:
|
||||
|
||||
Accepts an absolute path or a path relative to
|
||||
``src/lerobot/configs/`` so recipe authors can write
|
||||
``--policy.recipe_path=recipes/smolvla2_hirobot.yaml``.
|
||||
``--policy.recipe_path=recipes/hirobot.yaml``.
|
||||
"""
|
||||
p = Path(path_str)
|
||||
if not p.is_absolute() and not p.exists():
|
||||
|
||||
Reference in New Issue
Block a user