fix(pi052): VQA <loc> conversion treats coords as 0-1000 normalized

Confirmed empirically on the published dataset: VQA bbox/keypoint
coordinates are Qwen2.5-VL's 0–1000 normalized grounding output, NOT
pixels. Scanning 8207 samples showed x and y both spanning 0..1000
with ~30% of values exceeding the camera's pixel dimensions (which is
impossible if they were pixels).

_vqa_answer_to_loc was dividing by the observation image's H/W, so
e.g. point [742, 158] on a 640x480 wrist cam clamped x to <loc1023>
(the far-right edge) instead of mapping to <loc0759> (~74% across).
Fix: divide by 1000 — the actual Qwen scale. The conversion is now
camera-resolution-independent, so _camera_image_shapes and the
image_shapes plumbing through __call__ / _encode_messages /
_messages_vqa_to_loc are dropped. Tests updated to the new signature
and the 0–1000 round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-19 23:21:28 +02:00
parent 88519cb14c
commit 75507491bf
2 changed files with 84 additions and 141 deletions
@@ -240,45 +240,35 @@ def _sample_indices(value: Any, batch_size: int) -> list[int | None]:
#
# PaliGemma is pre-trained on detection / pointing with a ``<locNNNN>``
# vocabulary (normalized [0, 1023]). The recipe's bbox / keypoint VQA
# answers are stored as JSON with *pixel* coordinates. Training those in
# ``<loc>`` form leverages PaliGemma's prior instead of fighting it (the
# ``<loc>``-token salad). The conversion lives here — not in the dataset
# — so the dataset stays backbone-agnostic (SmolVLA2 keeps the JSON).
# answers are stored as JSON in Qwen2.5-VL's grounding convention:
# **0–1000 normalized coordinates**, NOT pixels. (Verified empirically
# on the published datasets: x and y both span 0..1000 with ~30% of
# values exceeding the camera's pixel dimensions — they're not pixels.)
# Converting to ``<loc>`` is therefore camera-resolution-independent:
# ``loc_idx = round(coord / 1000 * 1023)``. We do the conversion here —
# not in the dataset — so the dataset stays backbone-agnostic (SmolVLA2
# keeps the JSON).
# ---------------------------------------------------------------------------
def _camera_image_shapes(observation: dict[str, Any]) -> dict[str, tuple[int, int]]:
"""Map each ``observation.images.*`` key to its native ``(height, width)``.
VQA pixel coordinates are relative to the camera frame's native
resolution. PI052's input pipeline applies no spatial resize before
this step, so the observation image tensors are still at that
resolution — the correct reference for normalizing to ``<loc>``.
"""
shapes: dict[str, tuple[int, int]] = {}
for key, value in (observation or {}).items():
if not (isinstance(key, str) and key.startswith("observation.images.")):
continue
shape = getattr(value, "shape", None)
if shape is None or len(shape) < 2:
continue
shapes[key] = (int(shape[-2]), int(shape[-1])) # (H, W); handles (B,C,H,W)/(C,H,W)
return shapes
# The 0–1000 scale Qwen2.5-VL emits for grounding coordinates.
_VQA_COORD_SCALE = 1000.0
def _loc_token(coord: float, dim: int) -> str:
"""PaliGemma ``<locNNNN>`` for pixel ``coord`` on an axis of size ``dim``."""
idx = round(float(coord) / dim * 1023) if dim > 0 else 0
def _loc_token(coord: float, scale: float = _VQA_COORD_SCALE) -> str:
"""PaliGemma ``<locNNNN>`` for a coord on a ``[0, scale]`` axis."""
idx = round(float(coord) / scale * 1023) if scale > 0 else 0
return f"<loc{max(0, min(1023, idx)):04d}>"
def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | None:
def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
"""Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
PaliGemma convention: a point is ``<locY><locX> label``; a box is
``<locY0><locX0><locY1><locX1> label`` (y before x, each index in
[0, 1023]). Returns ``None`` for non-spatial answers (count /
attribute / spatial-relation) — those keep their JSON form.
Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
module-level note). PaliGemma convention: a point is
``<locY><locX> label``; a box is ``<locY0><locX0><locY1><locX1> label``
(y before x, each index in [0, 1023]). Returns ``None`` for
non-spatial answers (count / attribute / spatial-relation) — those
keep their JSON form.
"""
point = answer.get("point")
if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
@@ -287,7 +277,7 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
except (TypeError, ValueError):
return None
label = str(answer.get("label", "")).strip()
return f"{_loc_token(y, height)}{_loc_token(x, width)} {label}".strip()
return f"{_loc_token(y)}{_loc_token(x)} {label}".strip()
detections = answer.get("detections")
if isinstance(detections, list) and detections:
@@ -304,41 +294,26 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
continue
label = str(det.get("label", "")).strip()
toks = (
f"{_loc_token(y1, height)}{_loc_token(x1, width)}"
f"{_loc_token(y2, height)}{_loc_token(x2, width)}"
f"{_loc_token(y1)}{_loc_token(x1)}"
f"{_loc_token(y2)}{_loc_token(x2)}"
)
parts.append(f"{toks} {label}".strip())
return " ; ".join(parts) if parts else None
return None
def _preceding_image_feature(messages: list[dict[str, Any]], idx: int) -> str | None:
"""Camera ``feature`` of the nearest image block at or before ``idx``."""
for j in range(min(idx, len(messages) - 1), -1, -1):
content = messages[j].get("content")
if not isinstance(content, list):
continue
for block in content:
if isinstance(block, dict) and block.get("type") == "image":
feature = block.get("feature")
if isinstance(feature, str):
return feature
return None
def _messages_vqa_to_loc(
messages: list[dict[str, Any]],
target_indices: list[int],
image_shapes: dict[str, tuple[int, int]] | None,
) -> list[dict[str, Any]]:
"""Rewrite bbox / keypoint VQA *target* answers from JSON to ``<loc>`` text.
Each target turn whose content parses as a spatial VQA answer is
converted, using the camera frame found from the preceding image
block. Non-spatial answers, subtask / memory targets (plain text →
not JSON), and turns with no matching image shape are left untouched.
converted. Non-spatial answers and subtask / memory targets (plain
text → not JSON) are left untouched. Camera-independent: VQA coords
are 0–1000 normalized, so no observation lookup is needed.
"""
if not image_shapes or not target_indices:
if not target_indices:
return messages
out = list(messages)
for idx in target_indices:
@@ -353,11 +328,7 @@ def _messages_vqa_to_loc(
continue # subtask / memory targets are plain text — skip
if not isinstance(answer, dict):
continue
feature = _preceding_image_feature(out, idx)
if feature is None or feature not in image_shapes:
continue
h, w = image_shapes[feature]
loc_text = _vqa_answer_to_loc(answer, h, w)
loc_text = _vqa_answer_to_loc(answer)
if loc_text is not None:
out[idx] = {**out[idx], "content": loc_text}
return out
@@ -458,9 +429,9 @@ class PI052TextTokenizerStep(ProcessorStep):
return transition
tokenizer = self._ensure_tokenizer()
# Native camera resolutions — the reference frame for converting
# VQA pixel coordinates to PaliGemma <loc> tokens.
image_shapes = _camera_image_shapes(transition.get(TransitionKey.OBSERVATION) or {})
# VQA coords are 0–1000 normalized (Qwen2.5-VL convention) — the
# <loc> conversion is camera-resolution-independent and needs no
# observation lookup here.
if _is_batched_messages(messages):
indices_iter = _sample_indices(complementary.get("index"), len(messages))
encoded = [
@@ -471,7 +442,6 @@ class PI052TextTokenizerStep(ProcessorStep):
list(tgt_indices),
complementary,
sample_idx=int(s_idx) if s_idx is not None else None,
image_shapes=image_shapes,
)
for msg, streams, tgt_indices, s_idx in zip(
messages,
@@ -491,7 +461,6 @@ class PI052TextTokenizerStep(ProcessorStep):
list(complementary.get("target_message_indices") or []),
complementary,
sample_idx=sample_idx,
image_shapes=image_shapes,
)
]
@@ -545,7 +514,6 @@ class PI052TextTokenizerStep(ProcessorStep):
target_indices: list[int],
complementary: dict[str, Any],
sample_idx: int | None = None,
image_shapes: dict[str, tuple[int, int]] | None = None,
) -> tuple[Tensor, Tensor, Tensor, Tensor, str]:
# Optional: drop non-target messages per the dropout config.
# Keeps the supervised-target indices stable by re-mapping
@@ -564,9 +532,9 @@ class PI052TextTokenizerStep(ProcessorStep):
)
# Rewrite bbox / keypoint VQA target answers from JSON to
# PaliGemma <loc> text — done before stripping so the image
# block (camera frame) is still available to normalize against.
messages = _messages_vqa_to_loc(messages, target_indices, image_shapes)
# PaliGemma <loc> text. Coords are 0–1000 normalized so this is
# camera-independent.
messages = _messages_vqa_to_loc(messages, target_indices)
# Flatten ``say`` tool calls into ``<say>...</say>`` text before
# stripping, so the spoken reply is actually tokenized and
+50 -75
View File
@@ -19,8 +19,13 @@
PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in
PaliGemma's native ``<locNNNN>`` detection vocabulary so the LM head
reuses the detection prior instead of fighting it (the ``<loc>``-salad
bug). The dataset stays backbone-agnostic JSON; the conversion lives in
PI052's tokenizer. These tests pin the JSON → ``<loc>`` rewrite.
bug). The dataset stores Qwen2.5-VL's grounding output — **0–1000
normalized** coordinates, *not* pixels. (Verified empirically on the
published datasets: x and y both span 0..1000 with ~30% of values
exceeding the camera's pixel dimensions.) The conversion is therefore
camera-resolution-independent. The dataset stays backbone-agnostic
JSON; the conversion lives in PI052's tokenizer. These tests pin the
JSON → ``<loc>`` rewrite.
"""
import pytest
@@ -28,80 +33,49 @@ import pytest
pytest.importorskip("transformers")
from lerobot.policies.pi052.text_processor_pi052 import ( # noqa: E402
_camera_image_shapes,
_loc_token,
_messages_vqa_to_loc,
_vqa_answer_to_loc,
)
class _FakeTensor:
def __init__(self, shape):
self.shape = shape
def test_camera_image_shapes_extracts_hw_from_image_keys():
obs = {
"observation.images.top": _FakeTensor((1, 3, 240, 320)),
"observation.images.wrist": _FakeTensor((3, 480, 640)),
"observation.state": _FakeTensor((1, 7)),
"task": "x",
}
assert _camera_image_shapes(obs) == {
"observation.images.top": (240, 320),
"observation.images.wrist": (480, 640),
}
def test_camera_image_shapes_handles_empty():
assert _camera_image_shapes({}) == {}
assert _camera_image_shapes(None) == {}
def test_loc_token_normalizes_and_clamps():
assert _loc_token(0, 100) == "<loc0000>"
assert _loc_token(100, 100) == "<loc1023>"
assert _loc_token(50, 100) == f"<loc{round(50 / 100 * 1023):04d}>"
# Default scale is the 0–1000 Qwen convention.
assert _loc_token(0) == "<loc0000>"
assert _loc_token(1000) == "<loc1023>"
assert _loc_token(500) == f"<loc{round(500 / 1000 * 1023):04d}>"
# out-of-range coordinates clamp into [0, 1023]
assert _loc_token(999, 100) == "<loc1023>"
assert _loc_token(-5, 100) == "<loc0000>"
assert _loc_token(9999) == "<loc1023>"
assert _loc_token(-5) == "<loc0000>"
def test_vqa_answer_to_loc_keypoint():
answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
# height=240, width=320 → y=120/240=0.5, x=160/320=0.5
out = _vqa_answer_to_loc(answer, height=240, width=320)
assert out == "<loc0512><loc0512> blue cube"
def test_vqa_answer_to_loc_keypoint_normalized():
# Qwen 0–1000 normalized coordinates → camera-independent <loc>.
answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
def test_vqa_answer_to_loc_bbox():
def test_vqa_answer_to_loc_bbox_normalized():
answer = {
"detections": [
{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 320, 240]},
]
"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
}
out = _vqa_answer_to_loc(answer, height=240, width=320)
assert out == "<loc0000><loc0000><loc1023><loc1023> cube"
assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
def test_vqa_answer_to_loc_returns_none_for_non_spatial():
assert _vqa_answer_to_loc({"label": "cubes", "count": 2}, 240, 320) is None
assert _vqa_answer_to_loc({"weird": "payload"}, 240, 320) is None
assert _vqa_answer_to_loc({"label": "cubes", "count": 2}) is None
assert _vqa_answer_to_loc({"weird": "payload"}) is None
def test_messages_vqa_to_loc_rewrites_target_turn():
messages = [
{"role": "user", "content": [{"type": "text", "text": "where is the cube?"}]},
{
"role": "user",
"content": [
{"type": "image", "feature": "observation.images.top"},
{"type": "text", "text": "where is the cube?"},
],
"role": "assistant",
"content": '{"label": "cube", "point_format": "xy", "point": [500, 500]}',
},
{"role": "assistant", "content": '{"label": "cube", "point_format": "xy", "point": [160, 120]}'},
]
shapes = {"observation.images.top": (240, 320)}
out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
out = _messages_vqa_to_loc(messages, target_indices=[1])
assert out[1]["content"] == "<loc0512><loc0512> cube"
# input messages are not mutated
assert messages[1]["content"].startswith("{")
@@ -109,50 +83,51 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched():
messages = [
{"role": "user", "content": [{"type": "image", "feature": "observation.images.top"}]},
{"role": "user", "content": "pick the cube"},
{"role": "assistant", "content": "pick up the cube"},
]
shapes = {"observation.images.top": (240, 320)}
out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
out = _messages_vqa_to_loc(messages, target_indices=[1])
assert out[1]["content"] == "pick up the cube"
def test_messages_vqa_to_loc_noop_without_shapes():
messages = [{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}]
assert _messages_vqa_to_loc(messages, [0], None) is messages
assert _messages_vqa_to_loc(messages, [0], {}) is messages
def test_messages_vqa_to_loc_noop_without_target_indices():
messages = [
{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}
]
assert _messages_vqa_to_loc(messages, []) is messages
# ---------------------------------------------------------------------------
# Round-trip: training-side JSON -> <loc> -> runtime-side parse back to pixels
# Round-trip: training-side JSON -> <loc> -> runtime-side parse back
#
# Pins that the conversion preserves coordinate *order* (JSON is x-first,
# PaliGemma <loc> is y-first) and per-axis normalization. The only loss is
# quantization to the 1024-bucket <loc> grid, so a pixel survives within
# half a bucket (~W/2046, H/2046).
# PaliGemma <loc> is y-first) and the 0–1000 → [0, 1023] scaling. The
# only loss is quantization to the 1024-bucket <loc> grid, so a coord
# survives within half a bucket (~1000/2046 ≈ 0.49 on the 0–1000 scale).
# ---------------------------------------------------------------------------
def test_loc_round_trip_keypoint_preserves_pixels():
def test_loc_round_trip_keypoint_preserves_normalized_coords():
from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
h, w = 240, 320
answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
loc = _vqa_answer_to_loc(answer, h, w)
answer = {"label": "blue cube", "point_format": "xy", "point": [640, 480]}
loc = _vqa_answer_to_loc(answer)
parsed = parse_vqa_answer(loc)
nx, ny = parsed["payload"]["point"]
assert abs(nx * w - 160) <= w / 2046 + 1e-6
assert abs(ny * h - 120) <= h / 2046 + 1e-6
# parse_vqa_answer returns [0, 1] normalized; rescale back to 0–1000.
assert abs(nx * 1000.0 - 640) <= 1000.0 / 2046 + 1e-6
assert abs(ny * 1000.0 - 480) <= 1000.0 / 2046 + 1e-6
assert parsed["payload"]["label"] == "blue cube"
def test_loc_round_trip_bbox_preserves_pixels_and_order():
def test_loc_round_trip_bbox_preserves_order_and_scale():
from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
h, w = 240, 320
answer = {"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [32, 24, 288, 216]}]}
loc = _vqa_answer_to_loc(answer, h, w)
answer = {
"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [100, 200, 800, 900]}]
}
loc = _vqa_answer_to_loc(answer)
parsed = parse_vqa_answer(loc)
x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"]
for got, want, dim in ((x1, 32, w), (y1, 24, h), (x2, 288, w), (y2, 216, h)):
assert abs(got * dim - want) <= dim / 2046 + 1e-6
for got, want in ((x1, 100), (y1, 200), (x2, 800), (y2, 900)):
assert abs(got * 1000.0 - want) <= 1000.0 / 2046 + 1e-6