fix(pi052): VQA <loc> conversion treats coords as 0-1000 normalized

Confirmed empirically on the published dataset: VQA bbox/keypoint
coordinates are Qwen2.5-VL's 0–1000 normalized grounding output, NOT
pixels. Scanning 8207 samples showed x and y both spanning 0..1000
with ~30% of values exceeding the camera's pixel dimensions (which is
impossible if they were pixels).

_vqa_answer_to_loc was dividing by the observation image's H/W, so
e.g. point [742, 158] on a 640x480 wrist cam clamped x to <loc1023>
(the far-right edge) instead of mapping to <loc0759> (~74% across).
Fix: divide by 1000 — the actual Qwen scale. The conversion is now
camera-resolution-independent, so _camera_image_shapes and the
image_shapes plumbing through __call__ / _encode_messages /
_messages_vqa_to_loc are dropped. Tests updated to the new signature
and the 0–1000 round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-19 23:21:28 +02:00
parent 88519cb14c
commit 75507491bf
2 changed files with 84 additions and 141 deletions
@@ -240,45 +240,35 @@ def _sample_indices(value: Any, batch_size: int) -> list[int | None]:
# #
# PaliGemma is pre-trained on detection / pointing with a ``<locNNNN>`` # PaliGemma is pre-trained on detection / pointing with a ``<locNNNN>``
# vocabulary (normalized [0, 1023]). The recipe's bbox / keypoint VQA # vocabulary (normalized [0, 1023]). The recipe's bbox / keypoint VQA
# answers are stored as JSON with *pixel* coordinates. Training those in # answers are stored as JSON in Qwen2.5-VL's grounding convention:
# ``<loc>`` form leverages PaliGemma's prior instead of fighting it (the # **0–1000 normalized coordinates**, NOT pixels. (Verified empirically
# ``<loc>``-token salad). The conversion lives here — not in the dataset # on the published datasets: x and y both span 0..1000 with ~30% of
# — so the dataset stays backbone-agnostic (SmolVLA2 keeps the JSON). # values exceeding the camera's pixel dimensions — they're not pixels.)
# Converting to ``<loc>`` is therefore camera-resolution-independent:
# ``loc_idx = round(coord / 1000 * 1023)``. We do the conversion here —
# not in the dataset — so the dataset stays backbone-agnostic (SmolVLA2
# keeps the JSON).
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# The 0–1000 scale Qwen2.5-VL emits for grounding coordinates.
def _camera_image_shapes(observation: dict[str, Any]) -> dict[str, tuple[int, int]]: _VQA_COORD_SCALE = 1000.0
"""Map each ``observation.images.*`` key to its native ``(height, width)``.
VQA pixel coordinates are relative to the camera frame's native
resolution. PI052's input pipeline applies no spatial resize before
this step, so the observation image tensors are still at that
resolution — the correct reference for normalizing to ``<loc>``.
"""
shapes: dict[str, tuple[int, int]] = {}
for key, value in (observation or {}).items():
if not (isinstance(key, str) and key.startswith("observation.images.")):
continue
shape = getattr(value, "shape", None)
if shape is None or len(shape) < 2:
continue
shapes[key] = (int(shape[-2]), int(shape[-1])) # (H, W); handles (B,C,H,W)/(C,H,W)
return shapes
def _loc_token(coord: float, dim: int) -> str: def _loc_token(coord: float, scale: float = _VQA_COORD_SCALE) -> str:
"""PaliGemma ``<locNNNN>`` for pixel ``coord`` on an axis of size ``dim``.""" """PaliGemma ``<locNNNN>`` for a coord on a ``[0, scale]`` axis."""
idx = round(float(coord) / dim * 1023) if dim > 0 else 0 idx = round(float(coord) / scale * 1023) if scale > 0 else 0
return f"<loc{max(0, min(1023, idx)):04d}>" return f"<loc{max(0, min(1023, idx)):04d}>"
def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | None: def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None:
"""Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text. """Convert a bbox / keypoint VQA answer dict to PaliGemma ``<loc>`` text.
PaliGemma convention: a point is ``<locY><locX> label``; a box is Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see
``<locY0><locX0><locY1><locX1> label`` (y before x, each index in module-level note). PaliGemma convention: a point is
[0, 1023]). Returns ``None`` for non-spatial answers (count / ``<locY><locX> label``; a box is ``<locY0><locX0><locY1><locX1> label``
attribute / spatial-relation) — those keep their JSON form. (y before x, each index in [0, 1023]). Returns ``None`` for
non-spatial answers (count / attribute / spatial-relation) — those
keep their JSON form.
""" """
point = answer.get("point") point = answer.get("point")
if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer: if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer:
@@ -287,7 +277,7 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
except (TypeError, ValueError): except (TypeError, ValueError):
return None return None
label = str(answer.get("label", "")).strip() label = str(answer.get("label", "")).strip()
return f"{_loc_token(y, height)}{_loc_token(x, width)} {label}".strip() return f"{_loc_token(y)}{_loc_token(x)} {label}".strip()
detections = answer.get("detections") detections = answer.get("detections")
if isinstance(detections, list) and detections: if isinstance(detections, list) and detections:
@@ -304,41 +294,26 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str |
continue continue
label = str(det.get("label", "")).strip() label = str(det.get("label", "")).strip()
toks = ( toks = (
f"{_loc_token(y1, height)}{_loc_token(x1, width)}" f"{_loc_token(y1)}{_loc_token(x1)}"
f"{_loc_token(y2, height)}{_loc_token(x2, width)}" f"{_loc_token(y2)}{_loc_token(x2)}"
) )
parts.append(f"{toks} {label}".strip()) parts.append(f"{toks} {label}".strip())
return " ; ".join(parts) if parts else None return " ; ".join(parts) if parts else None
return None return None
def _preceding_image_feature(messages: list[dict[str, Any]], idx: int) -> str | None:
"""Camera ``feature`` of the nearest image block at or before ``idx``."""
for j in range(min(idx, len(messages) - 1), -1, -1):
content = messages[j].get("content")
if not isinstance(content, list):
continue
for block in content:
if isinstance(block, dict) and block.get("type") == "image":
feature = block.get("feature")
if isinstance(feature, str):
return feature
return None
def _messages_vqa_to_loc( def _messages_vqa_to_loc(
messages: list[dict[str, Any]], messages: list[dict[str, Any]],
target_indices: list[int], target_indices: list[int],
image_shapes: dict[str, tuple[int, int]] | None,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""Rewrite bbox / keypoint VQA *target* answers from JSON to ``<loc>`` text. """Rewrite bbox / keypoint VQA *target* answers from JSON to ``<loc>`` text.
Each target turn whose content parses as a spatial VQA answer is Each target turn whose content parses as a spatial VQA answer is
converted, using the camera frame found from the preceding image converted. Non-spatial answers and subtask / memory targets (plain
block. Non-spatial answers, subtask / memory targets (plain text → text → not JSON) are left untouched. Camera-independent: VQA coords
not JSON), and turns with no matching image shape are left untouched. are 0–1000 normalized, so no observation lookup is needed.
""" """
if not image_shapes or not target_indices: if not target_indices:
return messages return messages
out = list(messages) out = list(messages)
for idx in target_indices: for idx in target_indices:
@@ -353,11 +328,7 @@ def _messages_vqa_to_loc(
continue # subtask / memory targets are plain text — skip continue # subtask / memory targets are plain text — skip
if not isinstance(answer, dict): if not isinstance(answer, dict):
continue continue
feature = _preceding_image_feature(out, idx) loc_text = _vqa_answer_to_loc(answer)
if feature is None or feature not in image_shapes:
continue
h, w = image_shapes[feature]
loc_text = _vqa_answer_to_loc(answer, h, w)
if loc_text is not None: if loc_text is not None:
out[idx] = {**out[idx], "content": loc_text} out[idx] = {**out[idx], "content": loc_text}
return out return out
@@ -458,9 +429,9 @@ class PI052TextTokenizerStep(ProcessorStep):
return transition return transition
tokenizer = self._ensure_tokenizer() tokenizer = self._ensure_tokenizer()
# Native camera resolutions — the reference frame for converting # VQA coords are 0–1000 normalized (Qwen2.5-VL convention) — the
# VQA pixel coordinates to PaliGemma <loc> tokens. # <loc> conversion is camera-resolution-independent and needs no
image_shapes = _camera_image_shapes(transition.get(TransitionKey.OBSERVATION) or {}) # observation lookup here.
if _is_batched_messages(messages): if _is_batched_messages(messages):
indices_iter = _sample_indices(complementary.get("index"), len(messages)) indices_iter = _sample_indices(complementary.get("index"), len(messages))
encoded = [ encoded = [
@@ -471,7 +442,6 @@ class PI052TextTokenizerStep(ProcessorStep):
list(tgt_indices), list(tgt_indices),
complementary, complementary,
sample_idx=int(s_idx) if s_idx is not None else None, sample_idx=int(s_idx) if s_idx is not None else None,
image_shapes=image_shapes,
) )
for msg, streams, tgt_indices, s_idx in zip( for msg, streams, tgt_indices, s_idx in zip(
messages, messages,
@@ -491,7 +461,6 @@ class PI052TextTokenizerStep(ProcessorStep):
list(complementary.get("target_message_indices") or []), list(complementary.get("target_message_indices") or []),
complementary, complementary,
sample_idx=sample_idx, sample_idx=sample_idx,
image_shapes=image_shapes,
) )
] ]
@@ -545,7 +514,6 @@ class PI052TextTokenizerStep(ProcessorStep):
target_indices: list[int], target_indices: list[int],
complementary: dict[str, Any], complementary: dict[str, Any],
sample_idx: int | None = None, sample_idx: int | None = None,
image_shapes: dict[str, tuple[int, int]] | None = None,
) -> tuple[Tensor, Tensor, Tensor, Tensor, str]: ) -> tuple[Tensor, Tensor, Tensor, Tensor, str]:
# Optional: drop non-target messages per the dropout config. # Optional: drop non-target messages per the dropout config.
# Keeps the supervised-target indices stable by re-mapping # Keeps the supervised-target indices stable by re-mapping
@@ -564,9 +532,9 @@ class PI052TextTokenizerStep(ProcessorStep):
) )
# Rewrite bbox / keypoint VQA target answers from JSON to # Rewrite bbox / keypoint VQA target answers from JSON to
# PaliGemma <loc> text — done before stripping so the image # PaliGemma <loc> text. Coords are 0–1000 normalized so this is
# block (camera frame) is still available to normalize against. # camera-independent.
messages = _messages_vqa_to_loc(messages, target_indices, image_shapes) messages = _messages_vqa_to_loc(messages, target_indices)
# Flatten ``say`` tool calls into ``<say>...</say>`` text before # Flatten ``say`` tool calls into ``<say>...</say>`` text before
# stripping, so the spoken reply is actually tokenized and # stripping, so the spoken reply is actually tokenized and
+50 -75
View File
@@ -19,8 +19,13 @@
PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in
PaliGemma's native ``<locNNNN>`` detection vocabulary so the LM head PaliGemma's native ``<locNNNN>`` detection vocabulary so the LM head
reuses the detection prior instead of fighting it (the ``<loc>``-salad reuses the detection prior instead of fighting it (the ``<loc>``-salad
bug). The dataset stays backbone-agnostic JSON; the conversion lives in bug). The dataset stores Qwen2.5-VL's grounding output — **0–1000
PI052's tokenizer. These tests pin the JSON → ``<loc>`` rewrite. normalized** coordinates, *not* pixels. (Verified empirically on the
published datasets: x and y both span 0..1000 with ~30% of values
exceeding the camera's pixel dimensions.) The conversion is therefore
camera-resolution-independent. The dataset stays backbone-agnostic
JSON; the conversion lives in PI052's tokenizer. These tests pin the
JSON → ``<loc>`` rewrite.
""" """
import pytest import pytest
@@ -28,80 +33,49 @@ import pytest
pytest.importorskip("transformers") pytest.importorskip("transformers")
from lerobot.policies.pi052.text_processor_pi052 import ( # noqa: E402 from lerobot.policies.pi052.text_processor_pi052 import ( # noqa: E402
_camera_image_shapes,
_loc_token, _loc_token,
_messages_vqa_to_loc, _messages_vqa_to_loc,
_vqa_answer_to_loc, _vqa_answer_to_loc,
) )
class _FakeTensor:
def __init__(self, shape):
self.shape = shape
def test_camera_image_shapes_extracts_hw_from_image_keys():
obs = {
"observation.images.top": _FakeTensor((1, 3, 240, 320)),
"observation.images.wrist": _FakeTensor((3, 480, 640)),
"observation.state": _FakeTensor((1, 7)),
"task": "x",
}
assert _camera_image_shapes(obs) == {
"observation.images.top": (240, 320),
"observation.images.wrist": (480, 640),
}
def test_camera_image_shapes_handles_empty():
assert _camera_image_shapes({}) == {}
assert _camera_image_shapes(None) == {}
def test_loc_token_normalizes_and_clamps(): def test_loc_token_normalizes_and_clamps():
assert _loc_token(0, 100) == "<loc0000>" # Default scale is the 0–1000 Qwen convention.
assert _loc_token(100, 100) == "<loc1023>" assert _loc_token(0) == "<loc0000>"
assert _loc_token(50, 100) == f"<loc{round(50 / 100 * 1023):04d}>" assert _loc_token(1000) == "<loc1023>"
assert _loc_token(500) == f"<loc{round(500 / 1000 * 1023):04d}>"
# out-of-range coordinates clamp into [0, 1023] # out-of-range coordinates clamp into [0, 1023]
assert _loc_token(999, 100) == "<loc1023>" assert _loc_token(9999) == "<loc1023>"
assert _loc_token(-5, 100) == "<loc0000>" assert _loc_token(-5) == "<loc0000>"
def test_vqa_answer_to_loc_keypoint(): def test_vqa_answer_to_loc_keypoint_normalized():
answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]} # Qwen 0–1000 normalized coordinates → camera-independent <loc>.
# height=240, width=320 → y=120/240=0.5, x=160/320=0.5 answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]}
out = _vqa_answer_to_loc(answer, height=240, width=320) assert _vqa_answer_to_loc(answer) == "<loc0512><loc0512> blue cube"
assert out == "<loc0512><loc0512> blue cube"
def test_vqa_answer_to_loc_bbox(): def test_vqa_answer_to_loc_bbox_normalized():
answer = { answer = {
"detections": [ "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}]
{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 320, 240]},
]
} }
out = _vqa_answer_to_loc(answer, height=240, width=320) assert _vqa_answer_to_loc(answer) == "<loc0000><loc0000><loc1023><loc1023> cube"
assert out == "<loc0000><loc0000><loc1023><loc1023> cube"
def test_vqa_answer_to_loc_returns_none_for_non_spatial(): def test_vqa_answer_to_loc_returns_none_for_non_spatial():
assert _vqa_answer_to_loc({"label": "cubes", "count": 2}, 240, 320) is None assert _vqa_answer_to_loc({"label": "cubes", "count": 2}) is None
assert _vqa_answer_to_loc({"weird": "payload"}, 240, 320) is None assert _vqa_answer_to_loc({"weird": "payload"}) is None
def test_messages_vqa_to_loc_rewrites_target_turn(): def test_messages_vqa_to_loc_rewrites_target_turn():
messages = [ messages = [
{"role": "user", "content": [{"type": "text", "text": "where is the cube?"}]},
{ {
"role": "user", "role": "assistant",
"content": [ "content": '{"label": "cube", "point_format": "xy", "point": [500, 500]}',
{"type": "image", "feature": "observation.images.top"},
{"type": "text", "text": "where is the cube?"},
],
}, },
{"role": "assistant", "content": '{"label": "cube", "point_format": "xy", "point": [160, 120]}'},
] ]
shapes = {"observation.images.top": (240, 320)} out = _messages_vqa_to_loc(messages, target_indices=[1])
out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
assert out[1]["content"] == "<loc0512><loc0512> cube" assert out[1]["content"] == "<loc0512><loc0512> cube"
# input messages are not mutated # input messages are not mutated
assert messages[1]["content"].startswith("{") assert messages[1]["content"].startswith("{")
@@ -109,50 +83,51 @@ def test_messages_vqa_to_loc_rewrites_target_turn():
def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched(): def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched():
messages = [ messages = [
{"role": "user", "content": [{"type": "image", "feature": "observation.images.top"}]}, {"role": "user", "content": "pick the cube"},
{"role": "assistant", "content": "pick up the cube"}, {"role": "assistant", "content": "pick up the cube"},
] ]
shapes = {"observation.images.top": (240, 320)} out = _messages_vqa_to_loc(messages, target_indices=[1])
out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
assert out[1]["content"] == "pick up the cube" assert out[1]["content"] == "pick up the cube"
def test_messages_vqa_to_loc_noop_without_shapes(): def test_messages_vqa_to_loc_noop_without_target_indices():
messages = [{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}] messages = [
assert _messages_vqa_to_loc(messages, [0], None) is messages {"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}
assert _messages_vqa_to_loc(messages, [0], {}) is messages ]
assert _messages_vqa_to_loc(messages, []) is messages
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Round-trip: training-side JSON -> <loc> -> runtime-side parse back to pixels # Round-trip: training-side JSON -> <loc> -> runtime-side parse back
# #
# Pins that the conversion preserves coordinate *order* (JSON is x-first, # Pins that the conversion preserves coordinate *order* (JSON is x-first,
# PaliGemma <loc> is y-first) and per-axis normalization. The only loss is # PaliGemma <loc> is y-first) and the 0–1000 → [0, 1023] scaling. The
# quantization to the 1024-bucket <loc> grid, so a pixel survives within # only loss is quantization to the 1024-bucket <loc> grid, so a coord
# half a bucket (~W/2046, H/2046). # survives within half a bucket (~1000/2046 ≈ 0.49 on the 0–1000 scale).
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_loc_round_trip_keypoint_preserves_pixels(): def test_loc_round_trip_keypoint_preserves_normalized_coords():
from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
h, w = 240, 320 answer = {"label": "blue cube", "point_format": "xy", "point": [640, 480]}
answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]} loc = _vqa_answer_to_loc(answer)
loc = _vqa_answer_to_loc(answer, h, w)
parsed = parse_vqa_answer(loc) parsed = parse_vqa_answer(loc)
nx, ny = parsed["payload"]["point"] nx, ny = parsed["payload"]["point"]
assert abs(nx * w - 160) <= w / 2046 + 1e-6 # parse_vqa_answer returns [0, 1] normalized; rescale back to 0–1000.
assert abs(ny * h - 120) <= h / 2046 + 1e-6 assert abs(nx * 1000.0 - 640) <= 1000.0 / 2046 + 1e-6
assert abs(ny * 1000.0 - 480) <= 1000.0 / 2046 + 1e-6
assert parsed["payload"]["label"] == "blue cube" assert parsed["payload"]["label"] == "blue cube"
def test_loc_round_trip_bbox_preserves_pixels_and_order(): def test_loc_round_trip_bbox_preserves_order_and_scale():
from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
h, w = 240, 320 answer = {
answer = {"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [32, 24, 288, 216]}]} "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [100, 200, 800, 900]}]
loc = _vqa_answer_to_loc(answer, h, w) }
loc = _vqa_answer_to_loc(answer)
parsed = parse_vqa_answer(loc) parsed = parse_vqa_answer(loc)
x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"] x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"]
for got, want, dim in ((x1, 32, w), (y1, 24, h), (x2, 288, w), (y2, 216, h)): for got, want in ((x1, 100), (y1, 200), (x2, 800), (y2, 900)):
assert abs(got * dim - want) <= dim / 2046 + 1e-6 assert abs(got * 1000.0 - want) <= 1000.0 / 2046 + 1e-6