fix(pi052): handle batched rendered messages

Tokenize batched recipe outputs in PI052 so that training batches containing nested message lists do not crash before the model's forward pass. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-07-24 18:26:11 +00:00 · 2026-05-18 17:41:58 +00:00
parent 0e2dc1b76f
commit 1750a87104
2 changed files with 176 additions and 38 deletions
@@ -42,6 +42,7 @@ from dataclasses import dataclass
 from typing import Any
 import torch
 from torch import Tensor
 from lerobot.configs import PipelineFeatureType, PolicyFeature
 from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry
@@ -214,6 +215,25 @@ def _strip_blocks(message: dict[str, Any]) -> dict[str, Any]:
    return new
 def _is_batched_messages(messages: Any) -> bool:
    return isinstance(messages, list) and bool(messages) and isinstance(messages[0], list)
 def _sample_indices(value: Any, batch_size: int) -> list[int | None]:
    if value is None:
        return [None] * batch_size
    if isinstance(value, torch.Tensor):
        if value.numel() == 1:
            return [int(value.item())] * batch_size
        values = value.reshape(-1).tolist()
        return [int(v) for v in values[:batch_size]]
    if isinstance(value, (list, tuple)):
        if len(value) == 1:
            return _sample_indices(value[0], batch_size)
        return [int(v.item() if hasattr(v, "item") else v) for v in value[:batch_size]]
    return [int(value)] * batch_size
 def _format_messages(messages: list[dict[str, Any]]) -> tuple[str, list[tuple[int, int]]]:
    """Concatenate messages into the π0.5-style flat prompt.
@@ -285,8 +305,6 @@ class PI052TextTokenizerStep(ProcessorStep):
        transition = transition.copy()
        complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA, {}) or {}
        messages = complementary.get("messages") or []
        target_indices = list(complementary.get("target_message_indices") or [])
        message_streams = list(complementary.get("message_streams") or [])
        if not messages:
            # No recipe was rendered — caller will fall back to the
@@ -294,6 +312,90 @@ class PI052TextTokenizerStep(ProcessorStep):
            # unmodified.
            return transition
        tokenizer = self._ensure_tokenizer()
        if _is_batched_messages(messages):
            indices_iter = _sample_indices(complementary.get("index"), len(messages))
            encoded = [
                self._encode_messages(
                    tokenizer,
                    msg,
                    list(streams),
                    list(tgt_indices),
                    complementary,
                    sample_idx=int(s_idx) if s_idx is not None else None,
                )
                for msg, streams, tgt_indices, s_idx in zip(
                    messages,
                    complementary.get("message_streams") or [[] for _ in messages],
                    complementary.get("target_message_indices") or [[] for _ in messages],
                    indices_iter,
                    strict=False,
                )
            ]
        else:
            sample_idx = _sample_indices(complementary.get("index"), 1)[0]
            encoded = [
                self._encode_messages(
                    tokenizer,
                    messages,
                    list(complementary.get("message_streams") or []),
                    list(complementary.get("target_message_indices") or []),
                    complementary,
                    sample_idx=sample_idx,
                )
            ]
        if _DUMP_BUDGET > 0:
            if _is_batched_messages(messages):
                msgs_iter = messages
                streams_iter = complementary.get("message_streams") or [[] for _ in messages]
                targets_iter = complementary.get("target_message_indices") or [[] for _ in messages]
            else:
                msgs_iter = [messages]
                streams_iter = [list(complementary.get("message_streams") or [])]
                targets_iter = [list(complementary.get("target_message_indices") or [])]
            for msg, streams, targets, (ids, attn, labels, predict_action, prompt) in zip(
                msgs_iter, streams_iter, targets_iter, encoded, strict=False
            ):
                target_set = {int(i) for i in targets}
                annotated_msgs = [
                    {
                        **m,
                        "stream": streams[i] if i < len(streams) else None,
                        "target": True if i in target_set else None,
                    }
                    for i, m in enumerate(msg)
                ]
                _dump_recipe_sample(
                    messages=annotated_msgs,
                    prompt_text=prompt,
                    token_ids=ids.tolist(),
                    labels=labels.tolist(),
                    predict_actions=bool(predict_action.item()),
                    tokenizer=tokenizer,
                )
        obs = dict(transition.get(TransitionKey.OBSERVATION) or {})
        obs[OBS_LANGUAGE_TOKENS] = torch.stack([ids for ids, _, _, _, _ in encoded])
        obs[OBS_LANGUAGE_ATTENTION_MASK] = torch.stack([attn for _, attn, _, _, _ in encoded])
        transition[TransitionKey.OBSERVATION] = obs
        transition[TransitionKey.COMPLEMENTARY_DATA] = {
            **complementary,
            "text_labels": torch.stack([labels for _, _, labels, _, _ in encoded]),
            "predict_actions": torch.stack([pred for _, _, _, pred, _ in encoded]),
        }
        return transition
    def _encode_messages(
        self,
        tokenizer: Any,
        messages: list[dict[str, Any]],
        message_streams: list[str | None],
        target_indices: list[int],
        complementary: dict[str, Any],
        sample_idx: int | None = None,
    ) -> tuple[Tensor, Tensor, Tensor, Tensor, str]:
        # Optional: drop non-target messages per the dropout config.
        # Keeps the supervised-target indices stable by re-mapping
        # after removal.
@@ -307,6 +409,7 @@ class PI052TextTokenizerStep(ProcessorStep):
                messages,
                target_indices,
                complementary,
                sample_idx=sample_idx,
            )
        # Flatten ``say`` tool calls into ``<say>...</say>`` text before
@@ -315,7 +418,6 @@ class PI052TextTokenizerStep(ProcessorStep):
        messages = [_strip_blocks(_flatten_say_tool_calls(m)) for m in messages]
        prompt, spans = _format_messages(messages)
        tokenizer = self._ensure_tokenizer()
        encoded = tokenizer(
            prompt,
            max_length=self.max_length,
@@ -354,39 +456,7 @@ class PI052TextTokenizerStep(ProcessorStep):
            bool(any(s == "low_level" for s in message_streams)),
            dtype=torch.bool,
        )
-
+        return input_ids, attention_mask, labels, predict_actions, prompt
        if _DUMP_BUDGET > 0:
            # Stream / target metadata live in parallel arrays; zip them
            # back into the dicts so the dump shows them per message.
            target_set = {int(i) for i in target_indices}
            annotated_msgs = [
                {
                    **m,
                    "stream": message_streams[i] if i < len(message_streams) else None,
                    "target": True if i in target_set else None,
                }
                for i, m in enumerate(messages)
            ]
            _dump_recipe_sample(
                messages=annotated_msgs,
                prompt_text=prompt,
                token_ids=input_ids.tolist(),
                labels=labels.tolist(),
                predict_actions=bool(predict_actions.item()),
                tokenizer=tokenizer,
            )
        obs = dict(transition.get(TransitionKey.OBSERVATION) or {})
        obs[OBS_LANGUAGE_TOKENS] = input_ids.unsqueeze(0)
        obs[OBS_LANGUAGE_ATTENTION_MASK] = attention_mask.unsqueeze(0)
        transition[TransitionKey.OBSERVATION] = obs
        transition[TransitionKey.COMPLEMENTARY_DATA] = {
            **complementary,
            "text_labels": labels.unsqueeze(0),
            "predict_actions": predict_actions.unsqueeze(0),
        }
        return transition
    # ------------------------------------------------------------------
    # Per-component prompt dropout (Pi0.7 §V.E)
@@ -397,6 +467,7 @@ class PI052TextTokenizerStep(ProcessorStep):
        messages: list[dict[str, Any]],
        target_indices: list[int],
        complementary: dict[str, Any],
        sample_idx: int | None = None,
    ) -> tuple[list[dict[str, Any]], list[int]]:
        """Drop messages classified as plan/memory/subtask context.
@@ -411,7 +482,7 @@ class PI052TextTokenizerStep(ProcessorStep):
            # ``render_messages_processor``. Falling back to other
            # keys silently gave every sample seed=0 → identical
            # dropout pattern across the whole epoch.
-            seed_src = complementary.get("index", 0)
+            seed_src = sample_idx if sample_idx is not None else complementary.get("index", 0)
            try:
                if hasattr(seed_src, "item"):
                    seed_src = seed_src.item()
@@ -21,7 +21,11 @@ PaliGemma's flat prompt has no structured tool calls, so an assistant
 marker — otherwise the spoken reply is dropped and never supervised.
 """
-from lerobot.policies.pi052.text_processor_pi052 import _flatten_say_tool_calls
+import torch
 from lerobot.policies.pi052.text_processor_pi052 import PI052TextTokenizerStep, _flatten_say_tool_calls
 from lerobot.types import TransitionKey
 from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
 def _say_call(text):
@@ -58,3 +62,66 @@ def test_flatten_drops_non_say_tool_calls_but_keeps_content():
    )
    assert out["content"] == "plan only"
    assert "tool_calls" not in out
 class _CharTokenizer:
    pad_token_id = 0
    def __call__(
        self,
        text,
        max_length,
        padding,
        truncation,
        return_tensors,
        return_offsets_mapping,
        padding_side,
    ):
        ids = [ord(c) % 251 + 1 for c in text[:max_length]]
        offsets = [(i, i + 1) for i in range(len(ids))]
        attention = [1] * len(ids)
        if padding == "max_length" and len(ids) < max_length:
            pad = max_length - len(ids)
            ids += [self.pad_token_id] * pad
            offsets += [(0, 0)] * pad
            attention += [0] * pad
        return {
            "input_ids": torch.tensor([ids], dtype=torch.long),
            "attention_mask": torch.tensor([attention], dtype=torch.long),
            "offset_mapping": torch.tensor([offsets], dtype=torch.long),
        }
    def decode(self, token_ids, skip_special_tokens=False):
        return "".join(chr(max(int(i) - 1, 0)) for i in token_ids if int(i) != self.pad_token_id)
 def test_pi052_text_tokenizer_handles_batched_rendered_messages():
    """A batch of two nested message lists is tokenized into stacked (2, 64) tensors.

    Sample 0 has one supervised target message; sample 1 has no targets and
    a ``low_level`` stream, so only it is flagged for action prediction.
    """
    tokenizer_step = PI052TextTokenizerStep(max_length=64)
    # Swap in the character-level stub as the step's tokenizer.
    tokenizer_step._tokenizer = _CharTokenizer()

    supervised_sample = [
        {"role": "user", "content": "pick cube"},
        {"role": "assistant", "content": "move to cube"},
    ]
    action_only_sample = [{"role": "user", "content": "open drawer"}]
    complementary_in = {
        "messages": [supervised_sample, action_only_sample],
        "target_message_indices": [[1], []],
        "message_streams": [["high_level", "high_level"], ["low_level"]],
        "index": torch.tensor([10, 11]),
    }

    result = tokenizer_step(
        {
            TransitionKey.OBSERVATION: {},
            TransitionKey.COMPLEMENTARY_DATA: complementary_in,
        }
    )
    observation = result[TransitionKey.OBSERVATION]
    complementary_out = result[TransitionKey.COMPLEMENTARY_DATA]
    expected_shape = (2, 64)

    assert observation[OBS_LANGUAGE_TOKENS].shape == expected_shape
    assert observation[OBS_LANGUAGE_ATTENTION_MASK].shape == expected_shape
    assert complementary_out["text_labels"].shape == expected_shape
    assert complementary_out["predict_actions"].tolist() == [False, True]
    # Only the first sample carries supervised (non -100) label positions.
    assert (complementary_out["text_labels"][0] != -100).any()
    assert not (complementary_out["text_labels"][1] != -100).any()