fix(pi052): VQA <loc> conversion treats coords as 0-1000 normalized

Confirmed empirically on the published dataset: VQA bbox/keypoint coordinates are Qwen2.5-VL's 0–1000 normalized grounding output, NOT pixels. Scanning 8207 samples showed x and y both spanning 0..1000, with ~30% of values exceeding the camera's pixel dimensions (which would be impossible if they were pixels).

_vqa_answer_to_loc was dividing by the observation image's H/W, so e.g. point [742, 158] on a 640x480 wrist cam clamped x to <loc1023> (the far-right edge) instead of mapping to <loc0759> (round(742 / 1000 * 1023), ~74% across).

Fix: divide by 1000 — the actual Qwen scale. The conversion is now camera-resolution-independent, so _camera_image_shapes and the image_shapes plumbing through __call__ / _encode_messages / _messages_vqa_to_loc are dropped.

Tests updated to the new signature and the 0–1000 round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-23 01:41:54 +00:00 · 2026-05-19 23:21:28 +02:00
parent 88519cb14c
commit 75507491bf
2 changed files with 84 additions and 141 deletions
@@ -240,45 +240,35 @@ def _sample_indices(value: Any, batch_size: int) -> list[int | None]:
 #
 # PaliGemma is pre-trained on detection / pointing with a ``<locNNNN>``
 # vocabulary (normalized [0, 1023]). The recipe's bbox / keypoint VQA
-# answers are stored as JSON with *pixel* coordinates. Training those in
+# answers are stored as JSON in Qwen2.5-VL's grounding convention:
-# ``<loc>`` form leverages PaliGemma's prior instead of fighting it (the
+# **0–1000 normalized coordinates**, NOT pixels. (Verified empirically
-# ``<loc>``-token salad). The conversion lives here — not in the dataset
+# on the published datasets: x and y both span 0..1000 with ~30% of
-# — so the dataset stays backbone-agnostic (SmolVLA2 keeps the JSON).
+# values exceeding the camera's pixel dimensions — they're not pixels.)
 # Converting to ``<loc>`` is therefore camera-resolution-independent:
 # ``loc_idx = round(coord / 1000 * 1023)``. We do the conversion here —
 # not in the dataset — so the dataset stays backbone-agnostic (SmolVLA2
 # keeps the JSON).
 # ---------------------------------------------------------------------------
-
+# The 0–1000 scale Qwen2.5-VL emits for grounding coordinates.
-def _camera_image_shapes(observation: dict[str, Any]) -> dict[str, tuple[int, int]]:
+_VQA_COORD_SCALE = 1000.0
    """Map each ``observation.images.*`` key to its native ``(height, width)``.
    VQA pixel coordinates are relative to the camera frame's native
    resolution. PI052's input pipeline applies no spatial resize before
    this step, so the observation image tensors are still at that
    resolution — the correct reference for normalizing to ``<loc>``.
    """
    shapes: dict[str, tuple[int, int]] = {}
    for key, value in (observation or {}).items():
        if not (isinstance(key, str) and key.startswith("observation.images.")):
            continue
        shape = getattr(value, "shape", None)
        if shape is None or len(shape) < 2:
            continue
        shapes[key] = (int(shape[-2]), int(shape[-1]))  # (H, W); handles (B,C,H,W)/(C,H,W)
    return shapes
-def _loc_token(coord: float, dim: int) -> str:
+def _loc_token(coord: float, scale: float = _VQA_COORD_SCALE) -> str:
-    """PaliGemma ``<locNNNN>`` for pixel ``coord`` on an axis of size ``dim``."""
+    """PaliGemma ``<locNNNN>`` for a coord on a ``[0, scale]`` axis."""
-    idx = round(float(coord) / dim * 1023) if dim > 0 else 0
+    idx = round(float(coord) / scale * 1023) if scale > 0 else 0
    return f"<loc{max(0, min(1023, idx)):04d}>"
-def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | None:
+def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
    """Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
-    PaliGemma convention: a point is ``<locY><locX> label``; a box is
+    Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
-    ``<locY0><locX0><locY1><locX1> label`` (y before x, each index in
+    module-level note). PaliGemma convention: a point is
-    [0, 1023]). Returns ``None`` for non-spatial answers (count /
+    ``<locY><locX> label``; a box is ``<locY0><locX0><locY1><locX1> label``
-    attribute / spatial-relation) — those keep their JSON form.
+    (y before x, each index in [0, 1023]). Returns ``None`` for
    non-spatial answers (count / attribute / spatial-relation) — those
    keep their JSON form.
    """
    point = answer.get("point")
    if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
@@ -287,7 +277,7 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
        except (TypeError, ValueError):
            return None
        label = str(answer.get("label", "")).strip()
-        return f"{_loc_token(y, height)}{_loc_token(x, width)} {label}".strip()
+        return f"{_loc_token(y)}{_loc_token(x)} {label}".strip()
    detections = answer.get("detections")
    if isinstance(detections, list) and detections:
@@ -304,41 +294,26 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
                continue
            label = str(det.get("label", "")).strip()
            toks = (
-                f"{_loc_token(y1, height)}{_loc_token(x1, width)}"
+                f"{_loc_token(y1)}{_loc_token(x1)}"
-                f"{_loc_token(y2, height)}{_loc_token(x2, width)}"
+                f"{_loc_token(y2)}{_loc_token(x2)}"
            )
            parts.append(f"{toks} {label}".strip())
        return " ; ".join(parts) if parts else None
    return None
 def _preceding_image_feature(messages: list[dict[str, Any]], idx: int) -> str | None:
    """Camera ``feature`` of the nearest image block at or before ``idx``."""
    for j in range(min(idx, len(messages) - 1), -1, -1):
        content = messages[j].get("content")
        if not isinstance(content, list):
            continue
        for block in content:
            if isinstance(block, dict) and block.get("type") == "image":
                feature = block.get("feature")
                if isinstance(feature, str):
                    return feature
    return None
 def _messages_vqa_to_loc(
    messages: list[dict[str, Any]],
    target_indices: list[int],
    image_shapes: dict[str, tuple[int, int]] | None,
 ) -> list[dict[str, Any]]:
    """Rewrite bbox / keypoint VQA *target* answers from JSON to ``<loc>`` text.
    Each target turn whose content parses as a spatial VQA answer is
-    converted, using the camera frame found from the preceding image
+    converted. Non-spatial answers and subtask / memory targets (plain
-    block. Non-spatial answers, subtask / memory targets (plain text →
+    text → not JSON) are left untouched. Camera-independent: VQA coords
-    not JSON), and turns with no matching image shape are left untouched.
+    are 0–1000 normalized, so no observation lookup is needed.
    """
-    if not image_shapes or not target_indices:
+    if not target_indices:
        return messages
    out = list(messages)
    for idx in target_indices:
@@ -353,11 +328,7 @@ def _messages_vqa_to_loc(
            continue  # subtask / memory targets are plain text — skip
        if not isinstance(answer, dict):
            continue
-        feature = _preceding_image_feature(out, idx)
+        loc_text = _vqa_answer_to_loc(answer)
        if feature is None or feature not in image_shapes:
            continue
        h, w = image_shapes[feature]
        loc_text = _vqa_answer_to_loc(answer, h, w)
        if loc_text is not None:
            out[idx] = {**out[idx], "content": loc_text}
    return out
@@ -458,9 +429,9 @@ class PI052TextTokenizerStep(ProcessorStep):
            return transition
        tokenizer = self._ensure_tokenizer()
-        # Native camera resolutions — the reference frame for converting
+        # VQA coords are 0–1000 normalized (Qwen2.5-VL convention) — the
-        # VQA pixel coordinates to PaliGemma <loc> tokens.
+        # <loc> conversion is camera-resolution-independent and needs no
-        image_shapes = _camera_image_shapes(transition.get(TransitionKey.OBSERVATION) or {})
+        # observation lookup here.
        if _is_batched_messages(messages):
            indices_iter = _sample_indices(complementary.get("index"), len(messages))
            encoded = [
@@ -471,7 +442,6 @@ class PI052TextTokenizerStep(ProcessorStep):
                    list(tgt_indices),
                    complementary,
                    sample_idx=int(s_idx) if s_idx is not None else None,
                    image_shapes=image_shapes,
                )
                for msg, streams, tgt_indices, s_idx in zip(
                    messages,
@@ -491,7 +461,6 @@ class PI052TextTokenizerStep(ProcessorStep):
                    list(complementary.get("target_message_indices") or []),
                    complementary,
                    sample_idx=sample_idx,
                    image_shapes=image_shapes,
                )
            ]
@@ -545,7 +514,6 @@ class PI052TextTokenizerStep(ProcessorStep):
        target_indices: list[int],
        complementary: dict[str, Any],
        sample_idx: int | None = None,
        image_shapes: dict[str, tuple[int, int]] | None = None,
    ) -> tuple[Tensor, Tensor, Tensor, Tensor, str]:
        # Optional: drop non-target messages per the dropout config.
        # Keeps the supervised-target indices stable by re-mapping
@@ -564,9 +532,9 @@ class PI052TextTokenizerStep(ProcessorStep):
            )
        # Rewrite bbox / keypoint VQA target answers from JSON to
-        # PaliGemma <loc> text — done before stripping so the image
+        # PaliGemma <loc> text. Coords are 0–1000 normalized so this is
-        # block (camera frame) is still available to normalize against.
+        # camera-independent.
-        messages = _messages_vqa_to_loc(messages, target_indices, image_shapes)
+        messages = _messages_vqa_to_loc(messages, target_indices)
        # Flatten ``say`` tool calls into ``<say>...</say>`` text before
        # stripping, so the spoken reply is actually tokenized and
@@ -19,8 +19,13 @@
 PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in
 PaliGemma's native ``<locNNNN>`` detection vocabulary so the LM head
 reuses the detection prior instead of fighting it (the ``<loc>``-salad
-bug). The dataset stays backbone-agnostic JSON; the conversion lives in
+bug). The dataset stores Qwen2.5-VL's grounding output — **0–1000
-PI052's tokenizer. These tests pin the JSON → ``<loc>`` rewrite.
+normalized** coordinates, *not* pixels. (Verified empirically on the
 published datasets: x and y both span 0..1000 with ~30% of values
 exceeding the camera's pixel dimensions.) The conversion is therefore
 camera-resolution-independent. The dataset stays backbone-agnostic
 JSON; the conversion lives in PI052's tokenizer. These tests pin the
 JSON → ``<loc>`` rewrite.
 """
 import pytest
@@ -28,80 +33,49 @@ import pytest
 pytest.importorskip("transformers")
 from lerobot.policies.pi052.text_processor_pi052 import (  # noqa: E402
    _camera_image_shapes,
    _loc_token,
    _messages_vqa_to_loc,
    _vqa_answer_to_loc,
 )
 class _FakeTensor:
    def __init__(self, shape):
        self.shape = shape
 def test_camera_image_shapes_extracts_hw_from_image_keys():
    obs = {
        "observation.images.top": _FakeTensor((1, 3, 240, 320)),
        "observation.images.wrist": _FakeTensor((3, 480, 640)),
        "observation.state": _FakeTensor((1, 7)),
        "task": "x",
    }
    assert _camera_image_shapes(obs) == {
        "observation.images.top": (240, 320),
        "observation.images.wrist": (480, 640),
    }
 def test_camera_image_shapes_handles_empty():
    assert _camera_image_shapes({}) == {}
    assert _camera_image_shapes(None) == {}
 def test_loc_token_normalizes_and_clamps():
-    assert _loc_token(0, 100) == "<loc0000>"
+    # Default scale is the 0–1000 Qwen convention.
-    assert _loc_token(100, 100) == "<loc1023>"
+    assert _loc_token(0) == "<loc0000>"
-    assert _loc_token(50, 100) == f"<loc{round(50 / 100 * 1023):04d}>"
+    assert _loc_token(1000) == "<loc1023>"
    assert _loc_token(500) == f"<loc{round(500 / 1000 * 1023):04d}>"
    # out-of-range coordinates clamp into [0, 1023]
-    assert _loc_token(999, 100) == "<loc1023>"
+    assert _loc_token(9999) == "<loc1023>"
-    assert _loc_token(-5, 100) == "<loc0000>"
+    assert _loc_token(-5) == "<loc0000>"
-def test_vqa_answer_to_loc_keypoint():
+def test_vqa_answer_to_loc_keypoint_normalized():
-    answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
+    # Qwen 0–1000 normalized coordinates → camera-independent <loc>.
-    # height=240, width=320 → y=120/240=0.5, x=160/320=0.5
+    answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
-    out = _vqa_answer_to_loc(answer, height=240, width=320)
+    assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
    assert out == "<loc0512><loc0512> blue cube"
-def test_vqa_answer_to_loc_bbox():
+def test_vqa_answer_to_loc_bbox_normalized():
    answer = {
-        "detections": [
+        "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
            {"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 320, 240]},
        ]
    }
-    out = _vqa_answer_to_loc(answer, height=240, width=320)
+    assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
    assert out == "<loc0000><loc0000><loc1023><loc1023> cube"
 def test_vqa_answer_to_loc_returns_none_for_non_spatial():
-    assert _vqa_answer_to_loc({"label": "cubes", "count": 2}, 240, 320) is None
+    assert _vqa_answer_to_loc({"label": "cubes", "count": 2}) is None
-    assert _vqa_answer_to_loc({"weird": "payload"}, 240, 320) is None
+    assert _vqa_answer_to_loc({"weird": "payload"}) is None
 def test_messages_vqa_to_loc_rewrites_target_turn():
    messages = [
        {"role": "user", "content": [{"type": "text", "text": "where is the cube?"}]},
        {
-            "role": "user",
+            "role": "assistant",
-            "content": [
+            "content": '{"label": "cube", "point_format": "xy", "point": [500, 500]}',
                {"type": "image", "feature": "observation.images.top"},
                {"type": "text", "text": "where is the cube?"},
            ],
        },
        {"role": "assistant", "content": '{"label": "cube", "point_format": "xy", "point": [160, 120]}'},
    ]
-    shapes = {"observation.images.top": (240, 320)}
+    out = _messages_vqa_to_loc(messages, target_indices=[1])
    out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
    assert out[1]["content"] == "<loc0512><loc0512> cube"
    # input messages are not mutated
    assert messages[1]["content"].startswith("{")
@@ -109,50 +83,51 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
 def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched():
    messages = [
-        {"role": "user", "content": [{"type": "image", "feature": "observation.images.top"}]},
+        {"role": "user", "content": "pick the cube"},
        {"role": "assistant", "content": "pick up the cube"},
    ]
-    shapes = {"observation.images.top": (240, 320)}
+    out = _messages_vqa_to_loc(messages, target_indices=[1])
    out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
    assert out[1]["content"] == "pick up the cube"
-def test_messages_vqa_to_loc_noop_without_shapes():
+def test_messages_vqa_to_loc_noop_without_target_indices():
-    messages = [{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}]
+    messages = [
-    assert _messages_vqa_to_loc(messages, [0], None) is messages
+        {"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}
-    assert _messages_vqa_to_loc(messages, [0], {}) is messages
+    ]
    assert _messages_vqa_to_loc(messages, []) is messages
 # ---------------------------------------------------------------------------
-# Round-trip: training-side JSON -> <loc> -> runtime-side parse back to pixels
+# Round-trip: training-side JSON -> <loc> -> runtime-side parse back
 #
 # Pins that the conversion preserves coordinate *order* (JSON is x-first,
-# PaliGemma <loc> is y-first) and per-axis normalization. The only loss is
+# PaliGemma <loc> is y-first) and the 0–1000 → [0, 1023] scaling. The
-# quantization to the 1024-bucket <loc> grid, so a pixel survives within
+# only loss is quantization to the 1024-bucket <loc> grid, so a coord
-# half a bucket (~W/2046, H/2046).
+# survives within half a bucket (~1000/2046 ≈ 0.49 on the 0–1000 scale).
 # ---------------------------------------------------------------------------
-def test_loc_round_trip_keypoint_preserves_pixels():
+def test_loc_round_trip_keypoint_preserves_normalized_coords():
    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
-    h, w = 240, 320
+    answer = {"label": "blue cube", "point_format": "xy", "point": [640, 480]}
-    answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
+    loc = _vqa_answer_to_loc(answer)
    loc = _vqa_answer_to_loc(answer, h, w)
    parsed = parse_vqa_answer(loc)
    nx, ny = parsed["payload"]["point"]
-    assert abs(nx * w - 160) <= w / 2046 + 1e-6
+    # parse_vqa_answer returns [0, 1] normalized; rescale back to 0–1000.
-    assert abs(ny * h - 120) <= h / 2046 + 1e-6
+    assert abs(nx * 1000.0 - 640) <= 1000.0 / 2046 + 1e-6
    assert abs(ny * 1000.0 - 480) <= 1000.0 / 2046 + 1e-6
    assert parsed["payload"]["label"] == "blue cube"
-def test_loc_round_trip_bbox_preserves_pixels_and_order():
+def test_loc_round_trip_bbox_preserves_order_and_scale():
    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
-    h, w = 240, 320
+    answer = {
-    answer = {"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [32, 24, 288, 216]}]}
+        "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [100, 200, 800, 900]}]
-    loc = _vqa_answer_to_loc(answer, h, w)
+    }
    loc = _vqa_answer_to_loc(answer)
    parsed = parse_vqa_answer(loc)
    x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"]
-    for got, want, dim in ((x1, 32, w), (y1, 24, h), (x2, 288, w), (y2, 216, h)):
+    for got, want in ((x1, 100), (y1, 200), (x2, 800), (y2, 900)):
-        assert abs(got * dim - want) <= dim / 2046 + 1e-6
+        assert abs(got * 1000.0 - want) <= 1000.0 / 2046 + 1e-6