feat(smolvla2): chat-template processor + label mask + predict_actions

Wires PR 1's recipe stack into the SmolVLA2 pipeline so multi-target sub-recipes (memory_update, ask_vqa, user_interjection_response, high_level_subtask) carry meaningful supervision through to the model.

- New ``chat_processor_smolvla2.py`` with ``SmolVLA2ChatTokenizerStep``: reads ``messages`` / ``message_streams`` / ``target_message_indices`` from the rendered sample (PR 1 ``RenderMessagesStep``), calls ``apply_chat_template(messages, tools=DEFAULT_TOOLS, ...)`` on the SmolVLM tokenizer, and writes:

      OBS_LANGUAGE_TOKENS / _ATTENTION_MASK ← chat-templated prompt
      text_labels                           ← -100 except target msg tokens
      predict_actions                       ← True iff any low_level target

  Builds the label mask robustly by re-rendering the chat through each target's prefix and reading off the prefix length — same tokenizer, same tools, so the prefix tokens are guaranteed to be a prefix of the full sequence. Image/video content blocks (LeRobot ``feature``-keyed) are stripped before tokenizing; the actual image tensors flow through SmolVLA's existing ``OBS_IMAGES_*`` channels and ``embed_prefix`` puts them before the language embeddings, matching the chat-template-stripped text order.

- ``processor_smolvla2.py``: when ``config.recipe_path`` is set, build a new pipeline with ``RenderMessagesStep`` + ``SmolVLA2ChatTokenizerStep`` instead of SmolVLA's plain ``TokenizerProcessorStep``. When ``recipe_path`` is ``None``, fall back to SmolVLA's pipeline so unannotated datasets still work unchanged. Resolves recipe paths relative to ``src/lerobot/configs/`` so ``recipes/smolvla2_hirobot.yaml`` works directly.

The next commit on this branch picks up ``text_labels`` and ``predict_actions`` from the batch and routes them through the SmolVLM ``lm_head`` for the actual dual-loss training.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 18:56:09 +00:00 · 2026-04-30 19:21:03 +02:00
parent 52e1fd35cb
commit 37b1eb218a
2 changed files with 366 additions and 19 deletions
@@ -0,0 +1,271 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SmolVLA2's chat-template tokenization step.
+
+Replaces SmolVLA's plain ``TokenizerProcessorStep`` for SmolVLA2 when a
+``recipe_path`` is set. Reads the rendered messages produced by
+``RenderMessagesStep`` (PR 1) and produces:
+
+* ``OBS_LANGUAGE_TOKENS`` / ``OBS_LANGUAGE_ATTENTION_MASK`` —
+  the chat-templated prompt tokenized by SmolVLM's tokenizer, with
+  ``tools=self.tools`` (``DEFAULT_TOOLS`` unless overridden, e.g. with
+  the dataset's ``meta.tools`` via ``with_tools``).
+* ``text_labels`` — same shape as token ids, ``-100`` everywhere except
+  the positions belonging to messages whose index is in
+  ``target_message_indices``. The next commit's modeling forward path
+  applies cross-entropy on those positions via the SmolVLM ``lm_head``.
+* ``predict_actions`` — bool tensor, ``True`` iff any of the rendered
+  target messages has ``message_streams[i] == "low_level"``. The
+  modeling forward uses this to gate the flow head.
+
+Image / video content blocks in the rendered messages are dropped
+before tokenization — the chat template only handles text, and SmolVLA
+already passes camera tensors out-of-band via the standard
+``OBS_IMAGES_*`` features. This keeps the prefix layout unchanged
+(``embed_prefix`` puts image embeddings before language embeddings,
+matching the chat-template-stripped text order).
+"""
+
+from __future__ import annotations
+
+import copy
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.datasets.language import DEFAULT_TOOLS
+from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="smolvla2_chat_tokenizer")
+class SmolVLA2ChatTokenizerStep(ProcessorStep):
+    """Render messages → token ids + label mask + predict_actions flag.
+
+    This is the bridge between the recipe stack (PR 1's
+    ``RenderMessagesStep`` outputs) and the SmolVLA2 modeling forward
+    (next commit, which reads ``text_labels`` / ``predict_actions``).
+    Pure-text turns and multi-stream targets are both handled.
+
+    Reads ``messages``, ``message_streams`` and
+    ``target_message_indices`` from the transition's complementary
+    data. Writes the token ids and attention mask into the observation,
+    ``text_labels`` and ``predict_actions`` into the complementary data,
+    and removes the consumed recipe keys (plus ``task``).
+    """
+
+    # Name or path handed to ``AutoTokenizer.from_pretrained``.
+    tokenizer_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+    # Longer sequences are truncated on the right; this is also the
+    # padded length when ``padding == "max_length"``.
+    max_length: int = 2048
+    # Only ``"max_length"`` triggers padding in this step; any other
+    # value leaves the sample at its natural length.
+    padding: str = "longest"
+    # Side on which pad tokens are added: ``"left"`` or anything else
+    # for right padding.
+    padding_side: str = "right"
+    # Tool schemas passed to the chat template; ``None`` selects
+    # ``DEFAULT_TOOLS`` in ``__post_init__``.
+    tools: list[dict[str, Any]] | None = None
+
+    def __post_init__(self) -> None:
+        # Lazy: don't load the tokenizer until the step actually runs,
+        # so unit tests that import the module without transformers
+        # installed still pass.
+        self._tokenizer: Any = None
+        if self.tools is None:
+            # Default: ship the canonical ``say`` schema. Users who set
+            # ``meta.tools`` differently can override via
+            # ``with_tools(meta.tools)``.
+            self.tools = list(DEFAULT_TOOLS)
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def with_tools(self, tools: list[dict[str, Any]]) -> "SmolVLA2ChatTokenizerStep":
+        """Override the tools catalog passed to the chat template.
+
+        Stores a shallow copy of ``tools`` and returns ``self`` so the
+        call can be chained.
+        """
+        self.tools = list(tools)
+        return self
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition | None:
+        """Tokenize the rendered chat and attach labels to ``transition``.
+
+        Returns ``transition`` untouched when it carries no ``messages``.
+        Otherwise returns a shallow copy whose observation holds
+        ``OBS_LANGUAGE_TOKENS`` / ``OBS_LANGUAGE_ATTENTION_MASK`` and
+        whose complementary data holds ``text_labels`` (``-100`` outside
+        target messages) and ``predict_actions``.
+        """
+        comp = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        messages = comp.get("messages")
+        if not messages:
+            # No recipe rendering happened — nothing to do; downstream
+            # falls back to whatever ``task`` is in the transition.
+            return transition
+
+        message_streams: list[str | None] = list(comp.get("message_streams") or [])
+        text_messages = [_strip_lerobot_blocks(m) for m in messages]
+
+        # Keep only indices that address a rendered message. A negative
+        # index would otherwise slice from the end of the chat and wrap
+        # around in ``message_streams``, silently supervising the wrong
+        # tokens.
+        n_messages = len(text_messages)
+        requested = sorted({int(i) for i in (comp.get("target_message_indices") or [])})
+        target_indices: list[int] = [i for i in requested if 0 <= i < n_messages]
+        if len(target_indices) != len(requested):
+            logger.warning(
+                "Ignoring target_message_indices outside [0, %d): %s",
+                n_messages,
+                [i for i in requested if not 0 <= i < n_messages],
+            )
+
+        # Tokenize the full chat once.
+        full_ids = self._encode(text_messages)
+        labels = self._build_labels(text_messages, target_indices, full_ids)
+
+        # Truncate / pad to ``max_length`` so batches collate cleanly.
+        # The SmolVLA pipeline downstream relies on a fixed length
+        # behaviour ("longest" or "max_length") — we mirror it here.
+        if len(full_ids) > self.max_length:
+            had_supervision = any(v != -100 for v in labels)
+            full_ids = full_ids[: self.max_length]
+            labels = labels[: self.max_length]
+            if had_supervision and all(v == -100 for v in labels):
+                logger.warning(
+                    "Truncation to max_length=%d removed every supervised token; "
+                    "text_labels is all -100 for this sample.",
+                    self.max_length,
+                )
+        attn = [1] * len(full_ids)
+        n_pad = self.max_length - len(full_ids)
+        if self.padding == "max_length" and n_pad > 0:
+            tokenizer = self._get_tokenizer()
+            pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+            pad_ids, pad_labels, pad_attn = [pad_id] * n_pad, [-100] * n_pad, [0] * n_pad
+            if self.padding_side == "left":
+                full_ids = pad_ids + full_ids
+                labels = pad_labels + labels
+                attn = pad_attn + attn
+            else:
+                full_ids = full_ids + pad_ids
+                labels = labels + pad_labels
+                attn = attn + pad_attn
+
+        ids_t = torch.tensor(full_ids, dtype=torch.long)
+        attn_t = torch.tensor(attn, dtype=torch.bool)
+        labels_t = torch.tensor(labels, dtype=torch.long)
+        predict_actions = any(
+            i < len(message_streams) and message_streams[i] == "low_level"
+            for i in target_indices
+        )
+
+        new_complementary = dict(comp)
+        # Drop the per-recipe sidecar keys; everything downstream needs
+        # is now in the tokenized form.
+        new_complementary.pop("messages", None)
+        new_complementary.pop("message_streams", None)
+        new_complementary.pop("target_message_indices", None)
+        # SmolVLA's pipeline expects ``OBS_LANGUAGE_TOKENS`` /
+        # ``OBS_LANGUAGE_ATTENTION_MASK`` on the OBSERVATION key. Place
+        # them there — and drop ``task`` so the plain task-string
+        # tokenizer this step replaces has nothing left to tokenize a
+        # second time.
+        observation = dict(transition.get(TransitionKey.OBSERVATION) or {})
+        observation[OBS_LANGUAGE_TOKENS] = ids_t
+        observation[OBS_LANGUAGE_ATTENTION_MASK] = attn_t
+        new_complementary["text_labels"] = labels_t
+        new_complementary["predict_actions"] = torch.tensor(predict_actions, dtype=torch.bool)
+        new_complementary.pop("task", None)
+
+        new_transition = dict(transition)
+        new_transition[TransitionKey.COMPLEMENTARY_DATA] = new_complementary
+        new_transition[TransitionKey.OBSERVATION] = observation
+        return new_transition
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """Pass-through; this step writes runtime tensors not features."""
+        return features
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    def _encode(self, conversation: list[dict[str, Any]]) -> list[int]:
+        """Return the chat-templated token ids of ``conversation``.
+
+        Every call uses the same tokenizer, ``tools`` and template
+        options, which is what makes prefix lengths comparable across
+        calls.
+        """
+        ids = self._get_tokenizer().apply_chat_template(
+            conversation,
+            tools=self.tools,
+            add_generation_prompt=False,
+            tokenize=True,
+            return_tensors=None,
+            # Ask for bare ids explicitly rather than relying on the
+            # library default for the return type.
+            return_dict=False,
+        )
+        # Some code paths return a batch of one; unwrap it.
+        if isinstance(ids, list) and ids and isinstance(ids[0], list):
+            ids = ids[0]
+        return list(ids)
+
+    def _build_labels(
+        self,
+        text_messages: list[dict[str, Any]],
+        target_indices: list[int],
+        full_ids: list[int],
+    ) -> list[int]:
+        """Return labels equal to ``full_ids`` on target messages, ``-100`` elsewhere.
+
+        Token boundaries come from re-rendering the chat up to (and
+        through) each target message and reading off the lengths: the
+        span ``[len(prefix), len(prefix + target))`` is supervised.
+        """
+        labels = [-100] * len(full_ids)
+        last = len(text_messages) - 1
+        for tgt in target_indices:
+            start = len(self._encode(text_messages[:tgt]))
+            if tgt == last:
+                # Rendering through the last message is the full chat,
+                # which is already tokenized.
+                end = len(full_ids)
+            else:
+                end = min(len(self._encode(text_messages[: tgt + 1])), len(full_ids))
+            for pos in range(start, end):
+                labels[pos] = int(full_ids[pos])
+        return labels
+
+    def _get_tokenizer(self):  # noqa: ANN202
+        """Load the tokenizer on first use and cache it on the instance.
+
+        Raises:
+            ImportError: if ``transformers`` is not installed.
+        """
+        if self._tokenizer is not None:
+            return self._tokenizer
+        try:
+            from transformers import AutoTokenizer  # noqa: PLC0415
+        except ImportError as exc:  # pragma: no cover
+            raise ImportError(
+                "SmolVLA2ChatTokenizerStep requires transformers. "
+                "`pip install lerobot[transformers-dep]`."
+            ) from exc
+        self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
+        # Fall back to EOS as the pad token when the tokenizer defines none.
+        if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None:
+            self._tokenizer.pad_token = self._tokenizer.eos_token
+        return self._tokenizer
+
+
+def _strip_lerobot_blocks(message: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of ``message`` reduced to what the chat template needs.
+
+    Recipe authors may write multimodal content such as
+    ``{"type": "image", "feature": "observation.images.top"}``. The
+    SmolVLM tokenizer does not understand the ``feature`` key (it
+    expects ``url`` or ``path``), and the image tensors travel through
+    SmolVLA's ``OBS_IMAGES_*`` channels anyway, so only text blocks are
+    kept. The input dict is not modified.
+    """
+    # ``stream`` and ``target`` are recipe metadata; templates don't
+    # know them and may warn or crash.
+    cleaned = {key: value for key, value in message.items() if key not in ("stream", "target")}
+
+    raw_content = cleaned.get("content")
+    if isinstance(raw_content, list):
+        texts = [
+            str(part.get("text", ""))
+            for part in raw_content
+            if isinstance(part, dict) and part.get("type") == "text"
+        ]
+        if not texts:
+            cleaned["content"] = ""
+        elif len(texts) == 1:
+            # A lone text block is flattened to a plain string for
+            # template friendliness; some chat templates choke on a
+            # single-element list.
+            cleaned["content"] = texts[0]
+        else:
+            cleaned["content"] = [{"type": "text", "text": text} for text in texts]
+
+    # Drop empty tool_calls — some templates render them as a spurious
+    # empty marker.
+    if "tool_calls" in cleaned and not cleaned["tool_calls"]:
+        del cleaned["tool_calls"]
+    return cleaned
+
+
+# Public alias of ``_strip_lerobot_blocks``, re-exported for tests /
+# introspection.
+strip_lerobot_blocks = _strip_lerobot_blocks
@@ -13,43 +13,119 @@
 # limitations under the License.
 """SmolVLA2 processor pipelines.

-SCAFFOLD: this currently delegates to SmolVLA's processor. The next
-commit on this branch replaces that with a chat-template aware pipeline:
+When ``config.recipe_path`` is set, the pre-processor pipeline becomes:

-  RenderMessagesStep (PR1) → SmolVLA2ChatTokenizerStep → existing SmolVLA
-  normalization / device steps.
+    rename observations
+    add batch dim
+    RenderMessagesStep(recipe)              # PR 1: language_*  → messages
+    SmolVLA2ChatTokenizerStep(...)          # chat template + label mask + predict_actions
+    DeviceProcessorStep
+    NormalizerProcessorStep

-The chat tokenizer step will:
+When ``config.recipe_path`` is ``None``, we delegate to SmolVLA's
+plain task-string pipeline so unannotated datasets still work.

-* take ``messages`` / ``message_streams`` / ``target_message_indices``
-  from the rendered sample,
-* call ``apply_chat_template(messages, tools=DEFAULT_TOOLS, ...)`` on the
-  SmolVLM tokenizer,
-* tokenize the resulting prompt,
-* build a ``text_labels`` tensor with ``-100`` everywhere except the
-  token positions belonging to messages whose index is in
-  ``target_message_indices``,
-* derive ``predict_actions = bool(targets_by_stream.get("low_level"))``.
+Post-processor is unchanged from SmolVLA.
 """

 from __future__ import annotations

+from pathlib import Path
 from typing import Any

 import torch

+from lerobot.configs.recipe import TrainingRecipe
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    NormalizerProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    RenameObservationsProcessorStep,
+    RenderMessagesStep,
+    UnnormalizerProcessorStep,
+    policy_action_to_transition,
+    transition_to_policy_action,
+)
+from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
+
 from ..smolvla.processor_smolvla import make_smolvla_pre_post_processors
+from .chat_processor_smolvla2 import SmolVLA2ChatTokenizerStep
 from .configuration_smolvla2 import SmolVLA2Config


 def make_smolvla2_pre_post_processors(
     config: SmolVLA2Config,
     dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
-) -> tuple[Any, Any]:
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
     """Build SmolVLA2's pre/post-processor pipelines.

-    SCAFFOLD: just delegates to ``make_smolvla_pre_post_processors`` so
-    SmolVLA2 inherits SmolVLA's tokenization + normalization for now.
-    The recipe-driven chat-template rendering arrives in the next commit.
+    With ``recipe_path`` set, inserts the recipe-rendering step and the
+    chat-template tokenizer that emits ``text_labels`` and
+    ``predict_actions`` for the dual-loss path. Without it, falls back
+    to SmolVLA's plain task-string pipeline so unannotated datasets
+    keep working unchanged.
     """
-    return make_smolvla_pre_post_processors(config, dataset_stats=dataset_stats)
+    # No recipe configured (``None`` or empty): reuse SmolVLA's pipeline.
+    if not config.recipe_path:
+        return make_smolvla_pre_post_processors(config, dataset_stats=dataset_stats)
+
+    recipe = _load_recipe(config.recipe_path)
+
+    # ``RenderMessagesStep`` runs before ``SmolVLA2ChatTokenizerStep``
+    # because the tokenizer step consumes the rendered ``messages``.
+    input_steps = [
+        RenameObservationsProcessorStep(rename_map={}),
+        AddBatchDimensionProcessorStep(),
+        RenderMessagesStep(recipe=recipe),
+        SmolVLA2ChatTokenizerStep(
+            tokenizer_name=config.vlm_model_name,
+            max_length=config.tokenizer_max_length,
+            padding=config.pad_language_to,
+        ),
+        DeviceProcessorStep(device=config.device),
+        NormalizerProcessorStep(
+            features={**config.input_features, **config.output_features},
+            norm_map=config.normalization_mapping,
+            stats=dataset_stats,
+        ),
+    ]
+    # Post-processing: unnormalize the policy outputs with the same
+    # stats, then move the result to the CPU.
+    output_steps = [
+        UnnormalizerProcessorStep(
+            features=config.output_features,
+            norm_map=config.normalization_mapping,
+            stats=dataset_stats,
+        ),
+        DeviceProcessorStep(device="cpu"),
+    ]
+    return (
+        PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+            steps=input_steps,
+            name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+        ),
+        PolicyProcessorPipeline[PolicyAction, PolicyAction](
+            steps=output_steps,
+            name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+            to_transition=policy_action_to_transition,
+            to_output=transition_to_policy_action,
+        ),
+    )
+
+
+def _load_recipe(path_str: str) -> TrainingRecipe:
+    """Load the ``TrainingRecipe`` stored at ``path_str``.
+
+    The path is used as given when it is absolute or exists as given.
+    Otherwise it is looked up in the directory containing the
+    ``lerobot.configs.recipe`` module (``src/lerobot/configs/``), so
+    recipe authors can write
+    ``--policy.recipe_path=recipes/smolvla2_hirobot.yaml``. If that
+    lookup misses too, the original path is handed to
+    ``TrainingRecipe.from_yaml`` unchanged.
+    """
+    recipe_path = Path(path_str)
+    if recipe_path.is_absolute() or recipe_path.exists():
+        return TrainingRecipe.from_yaml(recipe_path)
+
+    from lerobot.configs import recipe as _recipe_module  # noqa: PLC0415
+
+    packaged = Path(_recipe_module.__file__).resolve().parent / path_str
+    return TrainingRecipe.from_yaml(packaged if packaged.exists() else recipe_path)