mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 03:59:42 +00:00
fix(pi052): VQA target format = "label <loc><loc>" not "<loc><loc> label"
The trained model collapsed to spewing 40+ <loc> tokens for *every* prompt — subtask, memory, anything — because VQA targets were supervised to *start* with <loc>. With ~25% of all text samples beginning with a <loc> token, the LM head learned "Assistant: → <loc>" as a strong attractor; once one loc is emitted, autoregression chains the rest.

Flip the format so every text target — subtask, memory, speech, AND VQA — starts with a regular word. The model still learns the <loc> vocabulary for the spatial portion of the answer, but loc can no longer be the first generation step out of a clean prompt.

Examples:
  point : "green box <loc0162><loc0759>"
  bbox  : "cube <loc0082>…<loc0409>"
  multi : "blue <locs> ; yellow <locs>"

The runtime parser (parse_loc_answer) strips loc tokens and uses the remainder as the label, so it's order-tolerant and works under either format. Old loc-first checkpoints still parse cleanly at inference; new training will use label-first.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -89,16 +89,31 @@ def test_loc_token_normalizes_and_clamps():
|
||||
|
||||
|
||||
def test_vqa_answer_to_loc_keypoint_normalized():
|
||||
# Qwen 0–1000 normalized coordinates → camera-independent <loc>.
|
||||
# Label-first: avoids the "Assistant: → <loc>" attractor at training.
|
||||
answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
|
||||
assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
|
||||
assert _vqa_answer_to_loc(answer) == "blue cube <loc0512><loc0512>"
|
||||
|
||||
|
||||
def test_vqa_answer_to_loc_bbox_normalized():
|
||||
answer = {
|
||||
"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
|
||||
}
|
||||
assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
|
||||
assert _vqa_answer_to_loc(answer) == "cube <loc0000><loc0000><loc1023><loc1023>"
|
||||
|
||||
|
||||
def test_vqa_answer_to_loc_multiple_detections_separator():
|
||||
answer = {
|
||||
"detections": [
|
||||
{"label": "blue", "bbox_format": "xyxy", "bbox": [0, 0, 500, 500]},
|
||||
{"label": "yellow", "bbox_format": "xyxy", "bbox": [500, 500, 1000, 1000]},
|
||||
]
|
||||
}
|
||||
out = _vqa_answer_to_loc(answer)
|
||||
# Each segment is "label <locs>", joined by " ; "
|
||||
assert out == (
|
||||
"blue <loc0000><loc0000><loc0512><loc0512> ; "
|
||||
"yellow <loc0512><loc0512><loc1023><loc1023>"
|
||||
)
|
||||
|
||||
|
||||
def test_vqa_answer_to_loc_returns_none_for_non_spatial():
|
||||
@@ -115,7 +130,7 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
|
||||
},
|
||||
]
|
||||
out = _messages_vqa_to_loc(messages, target_indices=[1])
|
||||
assert out[1]["content"] == "<loc0512><loc0512> cube"
|
||||
assert out[1]["content"] == "cube <loc0512><loc0512>"
|
||||
# input messages are not mutated
|
||||
assert messages[1]["content"].startswith("{")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user