fix(debug): dumper now shows real stream + target flags

The dumper printed ``stream=None target=None`` for every message because it read those fields from the message dicts. The recipe renderer, however, keeps them in parallel arrays (``message_streams`` / ``target_message_indices`` in COMPLEMENTARY_DATA) so that the chat template never sees unknown keys. Zip them back into the dump-time dicts so the printed metadata is accurate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-19 01:07:18 +00:00 · 2026-05-13 16:43:51 +02:00
parent 99ebee4d16
commit c3503b774f
2 changed files with 40 additions and 4 deletions
@@ -297,8 +297,19 @@ class PI052TextTokenizerStep(ProcessorStep):
        )

        if _DUMP_BUDGET > 0:
+            # Stream / target metadata live in parallel arrays; zip them
+            # back into the dicts so the dump shows them per message.
+            target_set = {int(i) for i in target_indices}
+            annotated_msgs = [
+                {
+                    **m,
+                    "stream": message_streams[i] if i < len(message_streams) else None,
+                    "target": True if i in target_set else None,
+                }
+                for i, m in enumerate(messages)
+            ]
            _dump_recipe_sample(
-                messages=messages,
+                messages=annotated_msgs,
                prompt_text=prompt,
                token_ids=input_ids.tolist(),
                labels=labels.tolist(),
@@ -239,10 +239,35 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
        # model actually sees. No-op unless ``LEROBOT_DUMP_RECIPE_SAMPLES``
        # is set; stops globally after the budget is exhausted.
        if _DUMP_BUDGET > 0:
-            msgs_iter = messages if _is_batched_messages(messages) else [messages]
-            for msg, (ids, labels, predict_action) in zip(msgs_iter, encoded, strict=False):
+            # Stream / target metadata live in parallel arrays in
+            # COMPLEMENTARY_DATA, not on the message dicts themselves
+            # (the recipe renderer keeps them separate so the chat
+            # template doesn't choke on unknown keys). Zip them back
+            # together for the dumper so each printed message shows
+            # its actual stream + target flag.
+            if _is_batched_messages(messages):
+                msgs_iter = messages
+                streams_iter = comp.get("message_streams") or [[] for _ in messages]
+                targets_iter = comp.get("target_message_indices") or [[] for _ in messages]
+            else:
+                msgs_iter = [messages]
+                streams_iter = [list(comp.get("message_streams") or [])]
+                targets_iter = [list(comp.get("target_message_indices") or [])]
+            for msg, streams, targets, (ids, labels, predict_action) in zip(
+                msgs_iter, streams_iter, targets_iter, encoded, strict=False
+            ):
+                target_set = {int(i) for i in targets}
+                annotated_msgs = []
+                for i, m in enumerate(msg):
+                    annotated_msgs.append(
+                        {
+                            **m,
+                            "stream": streams[i] if i < len(streams) else None,
+                            "target": True if i in target_set else None,
+                        }
+                    )
                _dump_recipe_sample(
-                    messages=msg,
+                    messages=annotated_msgs,
                    token_ids=ids,
                    labels=labels,
                    predict_actions=predict_action,