fix(debug): dumper now shows real stream + target flags

The dumper was printing ``stream=None target=None`` for every
message because it read those fields off the message dicts. The
recipe renderer does not store them there: it keeps them in
COMPLEMENTARY_DATA as ``message_streams`` (one entry per message)
and ``target_message_indices`` (the indices of the target
messages), so the chat template never sees unknown keys. At dump
time, merge them into annotated per-message copies of the dicts
(the originals are left untouched) so the printed metadata is
accurate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-13 16:43:51 +02:00
parent 99ebee4d16
commit c3503b774f
2 changed files with 40 additions and 4 deletions
@@ -297,8 +297,19 @@ class PI052TextTokenizerStep(ProcessorStep):
)
if _DUMP_BUDGET > 0:
# Stream / target metadata live in parallel arrays; zip them
# back into the dicts so the dump shows them per message.
target_set = {int(i) for i in target_indices}
annotated_msgs = [
{
**m,
"stream": message_streams[i] if i < len(message_streams) else None,
"target": True if i in target_set else None,
}
for i, m in enumerate(messages)
]
_dump_recipe_sample(
messages=messages,
messages=annotated_msgs,
prompt_text=prompt,
token_ids=input_ids.tolist(),
labels=labels.tolist(),
@@ -239,10 +239,35 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
# model actually sees. No-op unless ``LEROBOT_DUMP_RECIPE_SAMPLES``
# is set; stops globally after the budget is exhausted.
if _DUMP_BUDGET > 0:
msgs_iter = messages if _is_batched_messages(messages) else [messages]
for msg, (ids, labels, predict_action) in zip(msgs_iter, encoded, strict=False):
# Stream / target metadata live in parallel arrays in
# COMPLEMENTARY_DATA, not on the message dicts themselves
# (the recipe renderer keeps them separate so the chat
# template doesn't choke on unknown keys). Zip them back
# together for the dumper so each printed message shows
# its actual stream + target flag.
if _is_batched_messages(messages):
msgs_iter = messages
streams_iter = comp.get("message_streams") or [[] for _ in messages]
targets_iter = comp.get("target_message_indices") or [[] for _ in messages]
else:
msgs_iter = [messages]
streams_iter = [list(comp.get("message_streams") or [])]
targets_iter = [list(comp.get("target_message_indices") or [])]
for msg, streams, targets, (ids, labels, predict_action) in zip(
msgs_iter, streams_iter, targets_iter, encoded, strict=False
):
target_set = {int(i) for i in targets}
annotated_msgs = []
for i, m in enumerate(msg):
annotated_msgs.append(
{
**m,
"stream": streams[i] if i < len(streams) else None,
"target": True if i in target_set else None,
}
)
_dump_recipe_sample(
messages=msg,
messages=annotated_msgs,
token_ids=ids,
labels=labels,
predict_actions=predict_action,