From 5bb2da4da6c29f7d90f405b2c16062771adbb3d0 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 20 May 2026 18:56:48 +0200 Subject: [PATCH] fix(pi052): VQA target format = "label <locNNNN>" not "<locNNNN> label" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trained model collapsed to spewing 40+ <locNNNN> tokens for *every* prompt — subtask, memory, anything — because VQA targets were supervised to *start* with <locNNNN>. With ~25% of all text samples beginning with a <locNNNN> token, the LM head learned "Assistant: → <locNNNN>" as a strong attractor; once one loc is emitted, autoregression chains the rest. Flip the format so every text target — subtask, memory, speech, AND VQA — starts with a regular word. The model still learns the <locNNNN> vocabulary for the spatial portion of the answer, but loc can no longer be the first generation step out of a clean prompt. Examples: point : "green box <locY><locX>" bbox : "cube <locY1><locX1><locY2><locX2>" multi : "blue <locY1><locX1><locY2><locX2> ; yellow <locY1><locX1><locY2><locX2>" The runtime parser (parse_loc_answer) strips loc tokens and uses the remainder as label, so it's order-tolerant and works under either format. Old loc-first checkpoints still parse cleanly at inference; new training will use label-first. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/pi052/text_processor_pi052.py | 30 ++++++++++++++----- .../policies/smolvla2/inference/vqa.py | 12 +++++--- tests/policies/pi052/test_pi052_vqa_loc.py | 23 +++++++++++--- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py index ea5951b4b..18c926a0b 100644 --- a/src/lerobot/policies/pi052/text_processor_pi052.py +++ b/src/lerobot/policies/pi052/text_processor_pi052.py @@ -284,11 +284,23 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None: """Convert a bbox / keypoint VQA answer dict to PaliGemma ``<locNNNN>`` text. Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see - module-level note). 
PaliGemma convention: a point is - ``<locY><locX> label``; a box is - ``<locY1><locX1><locY2><locX2> label`` - (y before x, each index in [0, 1023]). Returns ``None`` for - non-spatial answers (count / attribute / spatial-relation) — those - keep their JSON form. + module-level note). y is emitted before x for each coordinate pair + (PaliGemma convention), with the integer indices in [0, 1023]. + + **Format: label first, locs after.** PaliGemma's pretraining puts + locs first (``<locY><locX> label``), but for our small-dataset VQA + blend that turns the LM head into a loc-emission attractor at every + ``Assistant:`` position — VQA targets share their first supervised + token with ~25% of all text samples, and the head collapses to + emitting ``<locNNNN>`` regardless of the prompt. Putting the label + first (``label <locY><locX>``) means every text sample (subtask, + memory, VQA, …) starts the supervised target with a real word, + breaking the attractor. The model still learns the loc vocabulary + for the *spatial* portion of the answer; it just can't fire it as + the first generation step from a clean prompt. + + Returns ``None`` for non-spatial answers (count / attribute / + spatial-relation) — those keep their JSON form. 
""" point = answer.get("point") if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer: @@ -297,7 +309,9 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None: except (TypeError, ValueError): return None label = str(answer.get("label", "")).strip() - return f"{_loc_token(y)}{_loc_token(x)} {label}".strip() + if not label: + return None + return f"{label} {_loc_token(y)}{_loc_token(x)}" detections = answer.get("detections") if isinstance(detections, list) and detections: @@ -313,11 +327,13 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None: except (TypeError, ValueError): continue label = str(det.get("label", "")).strip() + if not label: + continue toks = ( f"{_loc_token(y1)}{_loc_token(x1)}" f"{_loc_token(y2)}{_loc_token(x2)}" ) - parts.append(f"{toks} {label}".strip()) + parts.append(f"{label} {toks}") return " ; ".join(parts) if parts else None return None diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py index 74a672265..0992a0a12 100644 --- a/src/lerobot/policies/smolvla2/inference/vqa.py +++ b/src/lerobot/policies/smolvla2/inference/vqa.py @@ -133,10 +133,14 @@ def parse_loc_answer(answer: str) -> dict | None: """Parse a PaliGemma ``<locNNNN>``-format spatial VQA answer. PI052 trains spatial answers in PaliGemma's native detection - vocabulary: a point is ``<locY><locX> label``, a box is - ``<locY1><locX1><locY2><locX2> label``, and multiple boxes are joined - by `` ; ``. Coordinates come back *normalized* ([0, 1]); the overlay - denormalizes them against the chosen camera frame's pixel size. + vocabulary, label-first: a point is ``