mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 12:09:42 +00:00
fix(pi052): VQA target format = "label <loc><loc>" not "<loc><loc> label"
The trained model collapsed to spewing 40+ <loc> tokens for *every* prompt — subtask, memory, anything — because VQA targets were supervised to *start* with <loc>. With ~25% of all text samples beginning with a <loc> token, the LM head learned "Assistant: → <loc>" as a strong attractor; once one loc is emitted, autoregression chains the rest.

Flip the format so every text target — subtask, memory, speech, AND VQA — starts with a regular word. The model still learns the <loc> vocabulary for the spatial portion of the answer, but loc can no longer be the first generation step out of a clean prompt.

Examples:
  point : "green box <loc0162><loc0759>"
  bbox  : "cube <loc0082>…<loc0409>"
  multi : "blue <locs> ; yellow <locs>"

The runtime parser (parse_loc_answer) strips loc tokens and uses the remainder as label, so it's order-tolerant and works under either format. Old loc-first checkpoints still parse cleanly at inference; new training will use label-first.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -284,11 +284,23 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
|
|||||||
"""Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
|
"""Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
|
||||||
|
|
||||||
Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
|
Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
|
||||||
module-level note). PaliGemma convention: a point is
|
module-level note). y is emitted before x for each coordinate pair
|
||||||
``<locY><locX> label``; a box is ``<locY0><locX0><locY1><locX1> label``
|
(PaliGemma convention), with the integer indices in [0, 1023].
|
||||||
(y before x, each index in [0, 1023]). Returns ``None`` for
|
|
||||||
non-spatial answers (count / attribute / spatial-relation) — those
|
**Format: label first, locs after.** PaliGemma's pretraining puts
|
||||||
keep their JSON form.
|
locs first (``<loc><loc> label``), but for our small-dataset VQA
|
||||||
|
blend that turns the LM head into a loc-emission attractor at every
|
||||||
|
``Assistant:`` position — VQA targets share their first supervised
|
||||||
|
token with ~25% of all text samples, and the head collapses to
|
||||||
|
emitting ``<loc>`` regardless of the prompt. Putting the label
|
||||||
|
first (``label <locY><locX>``) means every text sample (subtask,
|
||||||
|
memory, VQA, …) starts the supervised target with a real word,
|
||||||
|
breaking the attractor. The model still learns the loc vocabulary
|
||||||
|
for the *spatial* portion of the answer; it just can't fire it as
|
||||||
|
the first generation step from a clean prompt.
|
||||||
|
|
||||||
|
Returns ``None`` for non-spatial answers (count / attribute /
|
||||||
|
spatial-relation) — those keep their JSON form.
|
||||||
"""
|
"""
|
||||||
point = answer.get("point")
|
point = answer.get("point")
|
||||||
if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
|
if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
|
||||||
@@ -297,7 +309,9 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
return None
|
return None
|
||||||
label = str(answer.get("label", "")).strip()
|
label = str(answer.get("label", "")).strip()
|
||||||
return f"{_loc_token(y)}{_loc_token(x)} {label}".strip()
|
if not label:
|
||||||
|
return None
|
||||||
|
return f"{label} {_loc_token(y)}{_loc_token(x)}"
|
||||||
|
|
||||||
detections = answer.get("detections")
|
detections = answer.get("detections")
|
||||||
if isinstance(detections, list) and detections:
|
if isinstance(detections, list) and detections:
|
||||||
@@ -313,11 +327,13 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
continue
|
continue
|
||||||
label = str(det.get("label", "")).strip()
|
label = str(det.get("label", "")).strip()
|
||||||
|
if not label:
|
||||||
|
continue
|
||||||
toks = (
|
toks = (
|
||||||
f"{_loc_token(y1)}{_loc_token(x1)}"
|
f"{_loc_token(y1)}{_loc_token(x1)}"
|
||||||
f"{_loc_token(y2)}{_loc_token(x2)}"
|
f"{_loc_token(y2)}{_loc_token(x2)}"
|
||||||
)
|
)
|
||||||
parts.append(f"{toks} {label}".strip())
|
parts.append(f"{label} {toks}")
|
||||||
return " ; ".join(parts) if parts else None
|
return " ; ".join(parts) if parts else None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -133,10 +133,14 @@ def parse_loc_answer(answer: str) -> dict | None:
|
|||||||
"""Parse a PaliGemma ``<loc>``-format spatial VQA answer.
|
"""Parse a PaliGemma ``<loc>``-format spatial VQA answer.
|
||||||
|
|
||||||
PI052 trains spatial answers in PaliGemma's native detection
|
PI052 trains spatial answers in PaliGemma's native detection
|
||||||
vocabulary: a point is ``<locY><locX> label``, a box is
|
vocabulary, label-first: a point is ``<label> <locY><locX>``, a box
|
||||||
``<locY0><locX0><locY1><locX1> label``, and multiple boxes are joined
|
is ``<label> <locY0><locX0><locY1><locX1>``, and multiple boxes are
|
||||||
by `` ; ``. Coordinates come back *normalized* ([0, 1]); the overlay
|
joined by `` ; `` (e.g. ``cube <loc..><loc..><loc..><loc..> ; box
|
||||||
denormalizes them against the chosen camera frame's pixel size.
|
<loc..><loc..><loc..><loc..>``). Loc-first formats are also accepted
|
||||||
|
— this parser strips loc tokens and treats the remainder as the
|
||||||
|
label, so order is irrelevant. Coordinates come back *normalized*
|
||||||
|
([0, 1]); the overlay denormalizes them against the chosen camera
|
||||||
|
frame's pixel size.
|
||||||
|
|
||||||
Returns ``{"kind", "payload", "normalized": True}`` on success
|
Returns ``{"kind", "payload", "normalized": True}`` on success
|
||||||
(``payload`` mirrors the JSON shapes so the overlay code is shared),
|
(``payload`` mirrors the JSON shapes so the overlay code is shared),
|
||||||
|
|||||||
@@ -89,16 +89,31 @@ def test_loc_token_normalizes_and_clamps():
|
|||||||
|
|
||||||
|
|
||||||
def test_vqa_answer_to_loc_keypoint_normalized():
|
def test_vqa_answer_to_loc_keypoint_normalized():
|
||||||
# Qwen 0–1000 normalized coordinates → camera-independent <loc>.
|
# Label-first: avoids the "Assistant: → <loc>" attractor at training.
|
||||||
answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
|
answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
|
||||||
assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
|
assert _vqa_answer_to_loc(answer) == "blue cube <loc0512><loc0512>"
|
||||||
|
|
||||||
|
|
||||||
def test_vqa_answer_to_loc_bbox_normalized():
|
def test_vqa_answer_to_loc_bbox_normalized():
|
||||||
answer = {
|
answer = {
|
||||||
"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
|
"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
|
||||||
}
|
}
|
||||||
assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
|
assert _vqa_answer_to_loc(answer) == "cube <loc0000><loc0000><loc1023><loc1023>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_vqa_answer_to_loc_multiple_detections_separator():
|
||||||
|
answer = {
|
||||||
|
"detections": [
|
||||||
|
{"label": "blue", "bbox_format": "xyxy", "bbox": [0, 0, 500, 500]},
|
||||||
|
{"label": "yellow", "bbox_format": "xyxy", "bbox": [500, 500, 1000, 1000]},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
out = _vqa_answer_to_loc(answer)
|
||||||
|
# Each segment is "label <locs>", joined by " ; "
|
||||||
|
assert out == (
|
||||||
|
"blue <loc0000><loc0000><loc0512><loc0512> ; "
|
||||||
|
"yellow <loc0512><loc0512><loc1023><loc1023>"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_vqa_answer_to_loc_returns_none_for_non_spatial():
|
def test_vqa_answer_to_loc_returns_none_for_non_spatial():
|
||||||
@@ -115,7 +130,7 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
|
|||||||
},
|
},
|
||||||
]
|
]
|
||||||
out = _messages_vqa_to_loc(messages, target_indices=[1])
|
out = _messages_vqa_to_loc(messages, target_indices=[1])
|
||||||
assert out[1]["content"] == "<loc0512><loc0512> cube"
|
assert out[1]["content"] == "cube <loc0512><loc0512>"
|
||||||
# input messages are not mutated
|
# input messages are not mutated
|
||||||
assert messages[1]["content"].startswith("{")
|
assert messages[1]["content"].startswith("{")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user