mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-19 01:07:18 +00:00
fix(debug): dumper now shows real stream + target flags
The dumper was printing ``stream=None target=None`` for every message because it read those fields off the message dicts, but the recipe renderer keeps them in parallel arrays (``message_streams`` / ``target_message_indices`` in COMPLEMENTARY_DATA) so the chat template doesn't see unknown keys. Zip them back into the dump-time dicts so the printed metadata is accurate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -297,8 +297,19 @@ class PI052TextTokenizerStep(ProcessorStep):
|
||||
)
|
||||
|
||||
if _DUMP_BUDGET > 0:
|
||||
# Stream / target metadata live in parallel arrays; zip them
|
||||
# back into the dicts so the dump shows them per message.
|
||||
target_set = {int(i) for i in target_indices}
|
||||
annotated_msgs = [
|
||||
{
|
||||
**m,
|
||||
"stream": message_streams[i] if i < len(message_streams) else None,
|
||||
"target": True if i in target_set else None,
|
||||
}
|
||||
for i, m in enumerate(messages)
|
||||
]
|
||||
_dump_recipe_sample(
|
||||
messages=messages,
|
||||
messages=annotated_msgs,
|
||||
prompt_text=prompt,
|
||||
token_ids=input_ids.tolist(),
|
||||
labels=labels.tolist(),
|
||||
|
||||
@@ -239,10 +239,35 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
|
||||
# model actually sees. No-op unless ``LEROBOT_DUMP_RECIPE_SAMPLES``
|
||||
# is set; stops globally after the budget is exhausted.
|
||||
if _DUMP_BUDGET > 0:
|
||||
msgs_iter = messages if _is_batched_messages(messages) else [messages]
|
||||
for msg, (ids, labels, predict_action) in zip(msgs_iter, encoded, strict=False):
|
||||
# Stream / target metadata live in parallel arrays in
|
||||
# COMPLEMENTARY_DATA, not on the message dicts themselves
|
||||
# (the recipe renderer keeps them separate so the chat
|
||||
# template doesn't choke on unknown keys). Zip them back
|
||||
# together for the dumper so each printed message shows
|
||||
# its actual stream + target flag.
|
||||
if _is_batched_messages(messages):
|
||||
msgs_iter = messages
|
||||
streams_iter = comp.get("message_streams") or [[] for _ in messages]
|
||||
targets_iter = comp.get("target_message_indices") or [[] for _ in messages]
|
||||
else:
|
||||
msgs_iter = [messages]
|
||||
streams_iter = [list(comp.get("message_streams") or [])]
|
||||
targets_iter = [list(comp.get("target_message_indices") or [])]
|
||||
for msg, streams, targets, (ids, labels, predict_action) in zip(
|
||||
msgs_iter, streams_iter, targets_iter, encoded, strict=False
|
||||
):
|
||||
target_set = {int(i) for i in targets}
|
||||
annotated_msgs = []
|
||||
for i, m in enumerate(msg):
|
||||
annotated_msgs.append(
|
||||
{
|
||||
**m,
|
||||
"stream": streams[i] if i < len(streams) else None,
|
||||
"target": True if i in target_set else None,
|
||||
}
|
||||
)
|
||||
_dump_recipe_sample(
|
||||
messages=msg,
|
||||
messages=annotated_msgs,
|
||||
token_ids=ids,
|
||||
labels=labels,
|
||||
predict_actions=predict_action,
|
||||
|
||||
Reference in New Issue
Block a user