From 5bb2da4da6c29f7d90f405b2c16062771adbb3d0 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 20 May 2026 18:56:48 +0200
Subject: [PATCH] fix(pi052): VQA target format = "label <loc><loc>" not
 "<loc><loc> label"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The trained model collapsed to spewing 40+ <loc> tokens for *every*
prompt — subtask, memory, anything — because VQA targets were supervised
to *start* with <loc>. With ~25% of all text samples beginning with a
<loc> token, the LM head learned "Assistant: → <loc>" as a strong
attractor; once one loc is emitted, autoregression chains the rest.

Flip the format so every text target — subtask, memory, speech, AND VQA
— starts with a regular word. The model still learns the <loc>
vocabulary for the spatial portion of the answer, but a <loc> token can
no longer be the first token generated from a clean prompt.

Examples:
  point  : "green box <loc0162><loc0759>"
  bbox   : "cube <loc0082>…<loc0409>"      (… elides the two middle <loc> tokens)
  multi  : "blue <locs> ; yellow <locs>"   (<locs> = four <loc> tokens per box)

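The runtime parser (parse_loc_answer) strips the loc tokens and uses the
remainder as the label, so it is order-tolerant and works with either
format. Outputs from old loc-first checkpoints still parse cleanly at
inference; new training uses label-first targets.

Behavior change: _vqa_answer_to_loc now returns None for a point answer
with an empty label and skips detections with an empty label, so that no
emitted target can consist of <loc> tokens alone.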

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/pi052/text_processor_pi052.py    | 30 ++++++++++++++-----
 .../policies/smolvla2/inference/vqa.py        | 12 +++++---
 tests/policies/pi052/test_pi052_vqa_loc.py    | 23 +++++++++++---
 3 files changed, 50 insertions(+), 15 deletions(-)
diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py
index ea5951b4b..18c926a0b 100644
--- a/src/lerobot/policies/pi052/text_processor_pi052.py
+++ b/src/lerobot/policies/pi052/text_processor_pi052.py
@@ -284,11 +284,23 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
     """Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
 
     Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
-    module-level note). PaliGemma convention: a point is
-    ``<locY><locX> label``; a box is ``<locY0><locX0><locY1><locX1> label``
-    (y before x, each index in [0, 1023]). Returns ``None`` for
-    non-spatial answers (count / attribute / spatial-relation) — those
-    keep their JSON form.
+    module-level note). y is emitted before x for each coordinate pair
+    (PaliGemma convention), with the integer indices in [0, 1023].
+
+    **Format: label first, locs after.** PaliGemma's pretraining puts
+    locs first (``<loc><loc> label``), but for our small-dataset VQA
+    blend that turns the LM head into a loc-emission attractor at every
+    ``Assistant:`` position — VQA targets share their first supervised
+    token with ~25% of all text samples, and the head collapses to
+    emitting ``<loc>`` regardless of the prompt. Putting the label
+    first (``label <locY><locX>``) means every text sample (subtask,
+    memory, VQA, …) starts the supervised target with a real word,
+    breaking the attractor. The model still learns the loc vocabulary
+    for the *spatial* portion of the answer; it just can't fire it as
+    the first generation step from a clean prompt.
+
+    Returns ``None`` for non-spatial answers (count / attribute /
+    spatial-relation, kept as JSON) or if no labeled point/box is usable.
     """
     point = answer.get("point")
     if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
@@ -297,7 +309,9 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
         except (TypeError, ValueError):
             return None
         label = str(answer.get("label", "")).strip()
-        return f"{_loc_token(y)}{_loc_token(x)} {label}".strip()
+        if not label:
+            return None
+        return f"{label} {_loc_token(y)}{_loc_token(x)}"
 
     detections = answer.get("detections")
     if isinstance(detections, list) and detections:
@@ -313,11 +327,13 @@ def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
             except (TypeError, ValueError):
                 continue
             label = str(det.get("label", "")).strip()
+            if not label:
+                continue
             toks = (
                 f"{_loc_token(y1)}{_loc_token(x1)}"
                 f"{_loc_token(y2)}{_loc_token(x2)}"
             )
-            parts.append(f"{toks} {label}".strip())
+            parts.append(f"{label} {toks}")
         return " ; ".join(parts) if parts else None
     return None
 
diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py
index 74a672265..0992a0a12 100644
--- a/src/lerobot/policies/smolvla2/inference/vqa.py
+++ b/src/lerobot/policies/smolvla2/inference/vqa.py
@@ -133,10 +133,14 @@ def parse_loc_answer(answer: str) -> dict | None:
     """Parse a PaliGemma ``<loc>``-format spatial VQA answer.
 
     PI052 trains spatial answers in PaliGemma's native detection
-    vocabulary: a point is ``<locY><locX> label``, a box is
-    ``<locY0><locX0><locY1><locX1> label``, and multiple boxes are joined
-    by `` ; ``. Coordinates come back *normalized* ([0, 1]); the overlay
-    denormalizes them against the chosen camera frame's pixel size.
+    vocabulary, label-first: a point is ``label <locY><locX>``, a box
+    is ``label <locY0><locX0><locY1><locX1>``, and multiple boxes are
+    joined by `` ; `` (e.g. ``cube <loc..><loc..><loc..><loc..> ; box
+    <loc..><loc..><loc..><loc..>``). Loc-first formats are also accepted
+    — this parser strips loc tokens and treats the remainder as the
+    label, so order is irrelevant. Coordinates come back *normalized*
+    ([0, 1]); the overlay denormalizes them against the chosen camera
+    frame's pixel size.
 
     Returns ``{"kind", "payload", "normalized": True}`` on success
     (``payload`` mirrors the JSON shapes so the overlay code is shared),
diff --git a/tests/policies/pi052/test_pi052_vqa_loc.py b/tests/policies/pi052/test_pi052_vqa_loc.py
index a1e145350..9207e4eb4 100644
--- a/tests/policies/pi052/test_pi052_vqa_loc.py
+++ b/tests/policies/pi052/test_pi052_vqa_loc.py
@@ -89,16 +89,31 @@ def test_loc_token_normalizes_and_clamps():
 
 
 def test_vqa_answer_to_loc_keypoint_normalized():
-    # Qwen 0–1000 normalized coordinates → camera-independent <loc>.
+    # Label-first: avoids the "Assistant: → <loc>" attractor at training.
     answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
-    assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
+    assert _vqa_answer_to_loc(answer) == "blue cube <loc0512><loc0512>"
 
 
 def test_vqa_answer_to_loc_bbox_normalized():
     answer = {
         "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
     }
-    assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
+    assert _vqa_answer_to_loc(answer) == "cube <loc0000><loc0000><loc1023><loc1023>"
+
+
+def test_vqa_answer_to_loc_multiple_detections_separator():
+    answer = {
+        "detections": [
+            {"label": "blue", "bbox_format": "xyxy", "bbox": [0, 0, 500, 500]},
+            {"label": "yellow", "bbox_format": "xyxy", "bbox": [500, 500, 1000, 1000]},
+        ]
+    }
+    out = _vqa_answer_to_loc(answer)
+    # Each segment is "label <locs>", joined by " ; "
+    assert out == (
+        "blue <loc0000><loc0000><loc0512><loc0512> ; "
+        "yellow <loc0512><loc0512><loc1023><loc1023>"
+    )
 
 
 def test_vqa_answer_to_loc_returns_none_for_non_spatial():
@@ -115,7 +130,7 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
         },
     ]
     out = _messages_vqa_to_loc(messages, target_indices=[1])
-    assert out[1]["content"] == "<loc0512><loc0512> cube"
+    assert out[1]["content"] == "cube <loc0512><loc0512>"
     # input messages are not mutated
     assert messages[1]["content"].startswith("{")