feat(smolvla2): chat-template processor + label mask + predict_actions

Wires PR 1's recipe stack into the SmolVLA2 pipeline so multi-target
sub-recipes (memory_update, ask_vqa, user_interjection_response,
high_level_subtask) carry meaningful supervision through to the model.

- New ``chat_processor_smolvla2.py`` with
  ``SmolVLA2ChatTokenizerStep``: reads ``messages`` /
  ``message_streams`` / ``target_message_indices`` from the rendered
  sample (PR 1 ``RenderMessagesStep``), calls
  ``apply_chat_template(messages, tools=DEFAULT_TOOLS, ...)`` on the
  SmolVLM tokenizer, and writes:

    OBS_LANGUAGE_TOKENS / _ATTENTION_MASK   ← chat-templated prompt
    text_labels                              ← -100 except target msg tokens
    predict_actions                          ← True iff any low_level target

  Builds the label mask robustly by re-rendering the chat through
  each target's prefix and reading off the prefix length — same
  tokenizer, same tools, so the prefix tokens are guaranteed to be
  a prefix of the full sequence. Image/video content blocks
  (LeRobot ``feature``-keyed) are stripped before tokenizing; the
  actual image tensors flow through SmolVLA's existing
  ``OBS_IMAGES_*`` channels and ``embed_prefix`` puts them before
  the language embeddings, matching the chat-template-stripped
  text order.

- ``processor_smolvla2.py``: when ``config.recipe_path`` is set,
  build a new pipeline with ``RenderMessagesStep`` +
  ``SmolVLA2ChatTokenizerStep`` instead of SmolVLA's plain
  ``TokenizerProcessorStep``. When ``recipe_path`` is ``None``,
  fall back to SmolVLA's pipeline so unannotated datasets still
  work unchanged. Resolves recipe paths relative to
  ``src/lerobot/configs/`` so ``recipes/smolvla2_hirobot.yaml``
  works directly.

The next commit on this branch picks up ``text_labels`` and
``predict_actions`` from the batch and routes them through the
SmolVLM ``lm_head`` for the actual dual-loss training.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-04-30 19:21:03 +02:00
parent 52e1fd35cb
commit 37b1eb218a
2 changed files with 366 additions and 19 deletions
@@ -0,0 +1,271 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SmolVLA2's chat-template tokenization step.
Replaces SmolVLA's plain ``TokenizerProcessorStep`` for SmolVLA2 when a
``recipe_path`` is set. Reads the rendered messages produced by
``RenderMessagesStep`` (PR 1) and produces:
* ``OBS_LANGUAGE_TOKENS`` / ``OBS_LANGUAGE_ATTENTION_MASK`` —
the chat-templated prompt tokenized by SmolVLM's tokenizer, with
``tools=self.tools`` (``DEFAULT_TOOLS`` unless overridden via ``with_tools``).
* ``text_labels`` — same shape as token ids, ``-100`` everywhere except
the positions belonging to messages whose index is in
``target_message_indices``. The next commit's modeling forward path
applies cross-entropy on those positions via the SmolVLM ``lm_head``.
* ``predict_actions`` — bool tensor, ``True`` iff any of the rendered
target messages has ``message_streams[i] == "low_level"``. The
modeling forward uses this to gate the flow head.
Image / video content blocks in the rendered messages are dropped
before tokenization — the chat template only handles text, and SmolVLA
already passes camera tensors out-of-band via the standard
``OBS_IMAGES_*`` features. This keeps the prefix layout unchanged
(``embed_prefix`` puts image embeddings before language embeddings,
matching the chat-template-stripped text order).
"""
from __future__ import annotations
import copy
import logging
from dataclasses import dataclass
from typing import Any
import torch
from lerobot.configs import PipelineFeatureType, PolicyFeature
from lerobot.datasets.language import DEFAULT_TOOLS
from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry
from lerobot.types import EnvTransition, TransitionKey
from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
logger = logging.getLogger(__name__)
@dataclass
@ProcessorStepRegistry.register(name="smolvla2_chat_tokenizer")
class SmolVLA2ChatTokenizerStep(ProcessorStep):
    """Render messages → token ids + label mask + predict_actions flag.

    Reads ``messages`` / ``message_streams`` / ``target_message_indices``
    from the transition's complementary data, renders the messages with
    the tokenizer's chat template, and writes:

    * ``OBS_LANGUAGE_TOKENS`` / ``OBS_LANGUAGE_ATTENTION_MASK`` on the
      observation (1-D ``long`` / ``bool`` tensors),
    * ``text_labels`` in complementary data (``-100`` everywhere except
      the token positions of target messages),
    * ``predict_actions`` in complementary data (scalar ``bool`` tensor,
      ``True`` iff any target message is on the ``"low_level"`` stream).

    The ``messages``, ``message_streams``, ``target_message_indices`` and
    ``task`` keys are removed from complementary data. Transitions with
    no ``messages`` are returned untouched.

    Attributes are plain dataclass fields:

    * ``tokenizer_name`` — checkpoint passed to ``AutoTokenizer``.
    * ``max_length`` — sequences longer than this are truncated.
    * ``padding`` — only ``"max_length"`` pads here; any other value
      (including the default ``"longest"``) leaves the sample unpadded.
    * ``padding_side`` — ``"left"`` pads in front of the tokens, anything
      else pads after them.
    * ``tools`` — tool schemas handed to the chat template; defaults to
      a copy of ``DEFAULT_TOOLS``.
    """

    tokenizer_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
    max_length: int = 2048
    padding: str = "longest"
    padding_side: str = "right"
    tools: list[dict[str, Any]] | None = None

    def __post_init__(self) -> None:
        # Lazy: don't load the tokenizer until the step actually runs,
        # so unit tests that import the module without transformers
        # installed still pass.
        self._tokenizer: Any = None
        if self.tools is None:
            # Default: ship the canonical ``say`` schema. Users who set
            # ``meta.tools`` differently can override via
            # ``with_tools(meta.tools)``.
            self.tools = list(DEFAULT_TOOLS)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def with_tools(self, tools: list[dict[str, Any]]) -> "SmolVLA2ChatTokenizerStep":
        """Override the tools catalog passed to the chat template.

        Stores a shallow copy of ``tools`` and returns ``self`` so the
        call can be chained.
        """
        self.tools = list(tools)
        return self

    def __call__(self, transition: EnvTransition) -> EnvTransition | None:
        """Tokenize the rendered chat and attach labels to ``transition``.

        Returns the input transition unchanged when it carries no
        ``messages``; otherwise returns a shallow copy with updated
        observation and complementary data (the input dicts are not
        mutated).
        """
        comp = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
        messages = comp.get("messages")
        if not messages:
            # No recipe rendering happened — nothing to do; downstream
            # falls back to whatever ``task`` is in the transition.
            return transition

        message_streams: list[str | None] = list(comp.get("message_streams") or [])
        target_indices: list[int] = sorted(
            int(i) for i in (comp.get("target_message_indices") or [])
        )

        tokenizer = self._get_tokenizer()
        text_messages = [_strip_lerobot_blocks(m) for m in messages]

        # Tokenize the full chat once.
        full_ids = self._render_ids(tokenizer, text_messages)

        # Build the label mask by re-rendering the chat up to (and then
        # through) each target message and reading off the two lengths.
        # This assumes the template renders a shorter conversation as an
        # exact token prefix of the longer one (same tokenizer, same
        # ``tools=``, no generation prompt).
        labels = [-100] * len(full_ids)
        for tgt in target_indices:
            if not 0 <= tgt < len(text_messages):
                # Out-of-range indices cannot select any tokens; skip
                # them instead of rendering a nonsensical slice.
                continue
            # Never render an empty conversation: a target at index 0
            # simply starts at the first token of the sequence.
            start = len(self._render_ids(tokenizer, text_messages[:tgt])) if tgt else 0
            through_target = self._render_ids(tokenizer, text_messages[: tgt + 1])
            end = min(len(through_target), len(full_ids))
            if start < end:
                labels[start:end] = full_ids[start:end]

        # Truncate to ``max_length`` so over-long chats stay bounded.
        if len(full_ids) > self.max_length:
            full_ids = full_ids[: self.max_length]
            labels = labels[: self.max_length]
        attn = [1] * len(full_ids)

        # Only ``padding == "max_length"`` pads at this per-sample step.
        n_pad = self.max_length - len(full_ids)
        if self.padding == "max_length" and n_pad > 0:
            pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
            pad_ids = [pad_id] * n_pad
            pad_labels = [-100] * n_pad
            pad_attn = [0] * n_pad
            if self.padding_side == "left":
                full_ids = pad_ids + full_ids
                labels = pad_labels + labels
                attn = pad_attn + attn
            else:
                full_ids = full_ids + pad_ids
                labels = labels + pad_labels
                attn = attn + pad_attn

        ids_t = torch.tensor(full_ids, dtype=torch.long)
        attn_t = torch.tensor(attn, dtype=torch.bool)
        labels_t = torch.tensor(labels, dtype=torch.long)

        predict_actions = any(
            i < len(message_streams) and message_streams[i] == "low_level"
            for i in target_indices
        )

        new_complementary = dict(comp)
        # Drop the per-recipe sidecar keys; everything downstream needs
        # is now in the tokenized form.
        new_complementary.pop("messages", None)
        new_complementary.pop("message_streams", None)
        new_complementary.pop("target_message_indices", None)
        # ``task`` is dropped as well so a plain task-string tokenizer
        # step cannot tokenize the prompt a second time.
        new_complementary.pop("task", None)
        new_complementary["text_labels"] = labels_t
        new_complementary["predict_actions"] = torch.tensor(predict_actions, dtype=torch.bool)

        # SmolVLA's pipeline expects ``OBS_LANGUAGE_TOKENS`` /
        # ``OBS_LANGUAGE_ATTENTION_MASK`` on the OBSERVATION key.
        observation = dict(transition.get(TransitionKey.OBSERVATION) or {})
        observation[OBS_LANGUAGE_TOKENS] = ids_t
        observation[OBS_LANGUAGE_ATTENTION_MASK] = attn_t

        new_transition = dict(transition)
        new_transition[TransitionKey.COMPLEMENTARY_DATA] = new_complementary
        new_transition[TransitionKey.OBSERVATION] = observation
        return new_transition

    def transform_features(
        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
        """Pass-through; this step writes runtime tensors not features."""
        return features

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def _render_ids(self, tokenizer: Any, messages: list[dict[str, Any]]) -> list[int]:
        """Render ``messages`` with the chat template and return flat token ids.

        Normalizes the shapes ``apply_chat_template`` may hand back: a
        mapping carrying ``input_ids``, a batch of one sequence, or a
        flat list of ids.
        """
        rendered = tokenizer.apply_chat_template(
            messages,
            tools=self.tools,
            add_generation_prompt=False,
            tokenize=True,
            return_tensors=None,
        )
        if hasattr(rendered, "keys"):
            # Dict-like encoding: the ids live under ``input_ids``.
            rendered = rendered["input_ids"]
        if isinstance(rendered, list) and rendered and isinstance(rendered[0], list):
            rendered = rendered[0]
        return [int(tok) for tok in rendered]

    def _get_tokenizer(self):  # noqa: ANN202
        """Load the tokenizer on first use and cache it on the instance.

        Raises:
            ImportError: if ``transformers`` is not installed.
        """
        if self._tokenizer is not None:
            return self._tokenizer
        try:
            from transformers import AutoTokenizer  # noqa: PLC0415
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "SmolVLA2ChatTokenizerStep requires transformers. "
                "`pip install lerobot[transformers-dep]`."
            ) from exc
        self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
        # Fall back to EOS as the pad token when the checkpoint has none.
        if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None:
            self._tokenizer.pad_token = self._tokenizer.eos_token
        return self._tokenizer
def _strip_lerobot_blocks(message: dict[str, Any]) -> dict[str, Any]:
    """Return a text-only copy of ``message`` suitable for a chat template.

    Recipe authors may write multimodal content blocks such as
    ``{"type": "image", "feature": "observation.images.top"}``. Only
    ``{"type": "text", ...}`` blocks are kept when ``content`` is a list;
    every other entry (including non-dict entries) is discarded:

    * exactly one surviving text block → ``content`` becomes that string,
    * several surviving text blocks → ``content`` stays a list of blocks,
    * none → ``content`` becomes ``""``.

    A falsy ``tool_calls`` entry and the recipe-only ``stream`` /
    ``target`` keys are removed. The input dict is not modified.
    """
    cleaned = dict(message)
    raw_content = cleaned.get("content")

    if isinstance(raw_content, list):
        kept = [
            {"type": "text", "text": str(part.get("text", ""))}
            for part in raw_content
            if isinstance(part, dict) and part.get("type") == "text"
        ]
        if not kept:
            cleaned["content"] = ""
        elif len(kept) == 1:
            # A lone text block is flattened to a plain string; some chat
            # templates choke on a single-element list.
            cleaned["content"] = kept[0]["text"]
        else:
            cleaned["content"] = kept

    # Empty tool_calls can be rendered as a spurious empty marker.
    if "tool_calls" in cleaned and not cleaned["tool_calls"]:
        del cleaned["tool_calls"]

    # Recipe metadata the templates do not know about.
    for meta_key in ("stream", "target"):
        cleaned.pop(meta_key, None)

    return cleaned


# Re-export for tests / introspection
strip_lerobot_blocks = _strip_lerobot_blocks
@@ -13,43 +13,119 @@
# limitations under the License.
"""SmolVLA2 processor pipelines.

When ``config.recipe_path`` is set, the pre-processor pipeline becomes:

    rename observations
    add batch dim
    RenderMessagesStep(recipe)        # PR 1: language_* → messages
    SmolVLA2ChatTokenizerStep(...)    # chat template + label mask + predict_actions
    DeviceProcessorStep
    NormalizerProcessorStep

The chat tokenizer step:

  * takes ``messages`` / ``message_streams`` / ``target_message_indices``
    from the rendered sample,
  * calls ``apply_chat_template(messages, tools=..., tokenize=True)`` on the
    SmolVLM tokenizer (``tools`` defaults to ``DEFAULT_TOOLS``),
  * builds a ``text_labels`` tensor with ``-100`` everywhere except the
    token positions belonging to messages whose index is in
    ``target_message_indices``,
  * sets ``predict_actions`` to ``True`` iff any target message is on the
    ``low_level`` stream.

When ``config.recipe_path`` is ``None``, we delegate to SmolVLA's
plain task-string pipeline so unannotated datasets still work.

Post-processor is unchanged from SmolVLA.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import torch
from lerobot.configs.recipe import TrainingRecipe
from lerobot.processor import (
AddBatchDimensionProcessorStep,
DeviceProcessorStep,
NormalizerProcessorStep,
PolicyAction,
PolicyProcessorPipeline,
RenameObservationsProcessorStep,
RenderMessagesStep,
UnnormalizerProcessorStep,
policy_action_to_transition,
transition_to_policy_action,
)
from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
from ..smolvla.processor_smolvla import make_smolvla_pre_post_processors
from .chat_processor_smolvla2 import SmolVLA2ChatTokenizerStep
from .configuration_smolvla2 import SmolVLA2Config
def make_smolvla2_pre_post_processors(
    config: SmolVLA2Config,
    dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
) -> tuple[
    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
    PolicyProcessorPipeline[PolicyAction, PolicyAction],
]:
    """Build SmolVLA2's pre/post-processor pipelines.

    With ``config.recipe_path`` set, the pre-processor renders the recipe
    into chat messages and tokenizes them with
    ``SmolVLA2ChatTokenizerStep``, which emits ``text_labels`` and
    ``predict_actions`` for the dual-loss path. Without it, both
    pipelines come straight from ``make_smolvla_pre_post_processors`` so
    unannotated datasets keep working unchanged.

    Args:
        config: SmolVLA2 configuration; ``recipe_path``, ``vlm_model_name``,
            ``tokenizer_max_length``, ``pad_language_to``, ``device``,
            ``input_features``, ``output_features`` and
            ``normalization_mapping`` are read here.
        dataset_stats: statistics forwarded to the normalizer and
            unnormalizer steps; may be ``None``.

    Returns:
        ``(pre_processor, post_processor)`` pipelines.
    """
    if not config.recipe_path:
        # No recipe: plain task-string pipeline inherited from SmolVLA.
        return make_smolvla_pre_post_processors(config, dataset_stats=dataset_stats)

    recipe = _load_recipe(config.recipe_path)

    input_steps = [
        RenameObservationsProcessorStep(rename_map={}),
        AddBatchDimensionProcessorStep(),
        RenderMessagesStep(recipe=recipe),
        SmolVLA2ChatTokenizerStep(
            tokenizer_name=config.vlm_model_name,
            max_length=config.tokenizer_max_length,
            padding=config.pad_language_to,
        ),
        DeviceProcessorStep(device=config.device),
        NormalizerProcessorStep(
            features={**config.input_features, **config.output_features},
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
    ]
    output_steps = [
        UnnormalizerProcessorStep(
            features=config.output_features,
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
        # Actions are moved back to the CPU after unnormalization.
        DeviceProcessorStep(device="cpu"),
    ]

    return (
        PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
            steps=input_steps,
            name=POLICY_PREPROCESSOR_DEFAULT_NAME,
        ),
        PolicyProcessorPipeline[PolicyAction, PolicyAction](
            steps=output_steps,
            name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
            to_transition=policy_action_to_transition,
            to_output=transition_to_policy_action,
        ),
    )
def _load_recipe(path_str: str) -> TrainingRecipe:
    """Load the ``TrainingRecipe`` YAML that ``path_str`` points at.

    ``path_str`` is used as given when it is absolute or when it already
    exists. Otherwise it is looked up next to the
    ``lerobot.configs.recipe`` module, so recipe authors can write
    ``--policy.recipe_path=recipes/smolvla2_hirobot.yaml``. If that
    candidate does not exist either, the original path is handed to
    ``TrainingRecipe.from_yaml`` unchanged.
    """
    recipe_path = Path(path_str)
    if recipe_path.is_absolute() or recipe_path.exists():
        return TrainingRecipe.from_yaml(recipe_path)

    from lerobot.configs import recipe as _recipe_module  # noqa: PLC0415

    bundled = Path(_recipe_module.__file__).resolve().parent / path_str
    return TrainingRecipe.from_yaml(bundled if bundled.exists() else recipe_path)