From c3503b774fc5d031ea6e4c1c0359dc3e1907ec0b Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 16:43:51 +0200 Subject: [PATCH] fix(debug): dumper now shows real stream + target flags The dumper was printing ``stream=None target=None`` for every message because it read those fields off the message dicts, but the recipe renderer keeps them in parallel arrays (``message_streams`` / ``target_message_indices`` in COMPLEMENTARY_DATA) so the chat template doesn't see unknown keys. Zip them back into the dump-time dicts so the printed metadata is accurate. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/pi052/text_processor_pi052.py | 13 +++++++- .../smolvla2/chat_processor_smolvla2.py | 31 +++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py index fddcba9df..c957cf590 100644 --- a/src/lerobot/policies/pi052/text_processor_pi052.py +++ b/src/lerobot/policies/pi052/text_processor_pi052.py @@ -297,8 +297,19 @@ class PI052TextTokenizerStep(ProcessorStep): ) if _DUMP_BUDGET > 0: + # Stream / target metadata live in parallel arrays; zip them + # back into the dicts so the dump shows them per message. + target_set = {int(i) for i in target_indices} + annotated_msgs = [ + { + **m, + "stream": message_streams[i] if i < len(message_streams) else None, + "target": True if i in target_set else None, + } + for i, m in enumerate(messages) + ] _dump_recipe_sample( - messages=messages, + messages=annotated_msgs, prompt_text=prompt, token_ids=input_ids.tolist(), labels=labels.tolist(), diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py index 23a5e5730..36fc02dca 100644 --- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py @@ -239,10 +239,35 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep): # model actually sees. No-op unless ``LEROBOT_DUMP_RECIPE_SAMPLES`` # is set; stops globally after the budget is exhausted. if _DUMP_BUDGET > 0: - msgs_iter = messages if _is_batched_messages(messages) else [messages] - for msg, (ids, labels, predict_action) in zip(msgs_iter, encoded, strict=False): + # Stream / target metadata live in parallel arrays in + # COMPLEMENTARY_DATA, not on the message dicts themselves + # (the recipe renderer keeps them separate so the chat + # template doesn't choke on unknown keys). Zip them back + # together for the dumper so each printed message shows + # its actual stream + target flag. + if _is_batched_messages(messages): + msgs_iter = messages + streams_iter = comp.get("message_streams") or [[] for _ in messages] + targets_iter = comp.get("target_message_indices") or [[] for _ in messages] + else: + msgs_iter = [messages] + streams_iter = [list(comp.get("message_streams") or [])] + targets_iter = [list(comp.get("target_message_indices") or [])] + for msg, streams, targets, (ids, labels, predict_action) in zip( + msgs_iter, streams_iter, targets_iter, encoded, strict=False + ): + target_set = {int(i) for i in targets} + annotated_msgs = [] + for i, m in enumerate(msg): + annotated_msgs.append( + { + **m, + "stream": streams[i] if i < len(streams) else None, + "target": True if i in target_set else None, + } + ) _dump_recipe_sample( - messages=msg, + messages=annotated_msgs, token_ids=ids, labels=labels, predict_actions=predict_action,