diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py index 8f59e2eaa..6e0e8bff6 100644 --- a/src/lerobot/policies/pi052/text_processor_pi052.py +++ b/src/lerobot/policies/pi052/text_processor_pi052.py @@ -240,45 +240,35 @@ def _sample_indices(value: Any, batch_size: int) -> list[int | None]: # # PaliGemma is pre-trained on detection / pointing with a ```` # vocabulary (normalized [0, 1023]). The recipe's bbox / keypoint VQA -# answers are stored as JSON with *pixel* coordinates. Training those in -# ```` form leverages PaliGemma's prior instead of fighting it (the -# ````-token salad). The conversion lives here — not in the dataset -# — so the dataset stays backbone-agnostic (SmolVLA2 keeps the JSON). +# answers are stored as JSON in Qwen2.5-VL's grounding convention: +# **0–1000 normalized coordinates**, NOT pixels. (Verified empirically +# on the published datasets: x and y both span 0..1000 with ~30% of +# values exceeding the camera's pixel dimensions — they're not pixels.) +# Converting to ```` is therefore camera-resolution-independent: +# ``loc_idx = round(coord / 1000 * 1023)``. We do the conversion here — +# not in the dataset — so the dataset stays backbone-agnostic (SmolVLA2 +# keeps the JSON). # --------------------------------------------------------------------------- - -def _camera_image_shapes(observation: dict[str, Any]) -> dict[str, tuple[int, int]]: - """Map each ``observation.images.*`` key to its native ``(height, width)``. - - VQA pixel coordinates are relative to the camera frame's native - resolution. PI052's input pipeline applies no spatial resize before - this step, so the observation image tensors are still at that - resolution — the correct reference for normalizing to ````. - """ - shapes: dict[str, tuple[int, int]] = {} - for key, value in (observation or {}).items(): - if not (isinstance(key, str) and key.startswith("observation.images.")): - continue - shape = getattr(value, "shape", None) - if shape is None or len(shape) < 2: - continue - shapes[key] = (int(shape[-2]), int(shape[-1])) # (H, W); handles (B,C,H,W)/(C,H,W) - return shapes +# The 0–1000 scale Qwen2.5-VL emits for grounding coordinates. +_VQA_COORD_SCALE = 1000.0 -def _loc_token(coord: float, dim: int) -> str: - """PaliGemma ```` for pixel ``coord`` on an axis of size ``dim``.""" - idx = round(float(coord) / dim * 1023) if dim > 0 else 0 +def _loc_token(coord: float, scale: float = _VQA_COORD_SCALE) -> str: + """PaliGemma ```` for a coord on a ``[0, scale]`` axis.""" + idx = round(float(coord) / scale * 1023) if scale > 0 else 0 return f"" -def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | None: +def _vqa_answer_to_loc(answer: dict[str, Any]) -> str | None: """Convert a bbox / keypoint VQA answer dict to PaliGemma ```` text. - PaliGemma convention: a point is `` label``; a box is - `` label`` (y before x, each index in - [0, 1023]). Returns ``None`` for non-spatial answers (count / - attribute / spatial-relation) — those keep their JSON form. + Input coordinates are in Qwen2.5-VL's 0–1000 normalized space (see + module-level note). PaliGemma convention: a point is + `` label``; a box is `` label`` + (y before x, each index in [0, 1023]). Returns ``None`` for + non-spatial answers (count / attribute / spatial-relation) — those + keep their JSON form. """ point = answer.get("point") if isinstance(point, list | tuple) and len(point) == 2 and "point_format" in answer: @@ -287,7 +277,7 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | except (TypeError, ValueError): return None label = str(answer.get("label", "")).strip() - return f"{_loc_token(y, height)}{_loc_token(x, width)} {label}".strip() + return f"{_loc_token(y)}{_loc_token(x)} {label}".strip() detections = answer.get("detections") if isinstance(detections, list) and detections: @@ -304,41 +294,26 @@ def _vqa_answer_to_loc(answer: dict[str, Any], height: int, width: int) -> str | continue label = str(det.get("label", "")).strip() toks = ( - f"{_loc_token(y1, height)}{_loc_token(x1, width)}" - f"{_loc_token(y2, height)}{_loc_token(x2, width)}" + f"{_loc_token(y1)}{_loc_token(x1)}" + f"{_loc_token(y2)}{_loc_token(x2)}" ) parts.append(f"{toks} {label}".strip()) return " ; ".join(parts) if parts else None return None -def _preceding_image_feature(messages: list[dict[str, Any]], idx: int) -> str | None: - """Camera ``feature`` of the nearest image block at or before ``idx``.""" - for j in range(min(idx, len(messages) - 1), -1, -1): - content = messages[j].get("content") - if not isinstance(content, list): - continue - for block in content: - if isinstance(block, dict) and block.get("type") == "image": - feature = block.get("feature") - if isinstance(feature, str): - return feature - return None - - def _messages_vqa_to_loc( messages: list[dict[str, Any]], target_indices: list[int], - image_shapes: dict[str, tuple[int, int]] | None, ) -> list[dict[str, Any]]: """Rewrite bbox / keypoint VQA *target* answers from JSON to ```` text. Each target turn whose content parses as a spatial VQA answer is - converted, using the camera frame found from the preceding image - block. Non-spatial answers, subtask / memory targets (plain text → - not JSON), and turns with no matching image shape are left untouched. + converted. Non-spatial answers and subtask / memory targets (plain + text → not JSON) are left untouched. Camera-independent: VQA coords + are 0–1000 normalized, so no observation lookup is needed. """ - if not image_shapes or not target_indices: + if not target_indices: return messages out = list(messages) for idx in target_indices: @@ -353,11 +328,7 @@ def _messages_vqa_to_loc( continue # subtask / memory targets are plain text — skip if not isinstance(answer, dict): continue - feature = _preceding_image_feature(out, idx) - if feature is None or feature not in image_shapes: - continue - h, w = image_shapes[feature] - loc_text = _vqa_answer_to_loc(answer, h, w) + loc_text = _vqa_answer_to_loc(answer) if loc_text is not None: out[idx] = {**out[idx], "content": loc_text} return out @@ -458,9 +429,9 @@ class PI052TextTokenizerStep(ProcessorStep): return transition tokenizer = self._ensure_tokenizer() - # Native camera resolutions — the reference frame for converting - # VQA pixel coordinates to PaliGemma tokens. - image_shapes = _camera_image_shapes(transition.get(TransitionKey.OBSERVATION) or {}) + # VQA coords are 0–1000 normalized (Qwen2.5-VL convention) — the + # conversion is camera-resolution-independent and needs no + # observation lookup here. if _is_batched_messages(messages): indices_iter = _sample_indices(complementary.get("index"), len(messages)) encoded = [ @@ -471,7 +442,6 @@ class PI052TextTokenizerStep(ProcessorStep): list(tgt_indices), complementary, sample_idx=int(s_idx) if s_idx is not None else None, - image_shapes=image_shapes, ) for msg, streams, tgt_indices, s_idx in zip( messages, @@ -491,7 +461,6 @@ class PI052TextTokenizerStep(ProcessorStep): list(complementary.get("target_message_indices") or []), complementary, sample_idx=sample_idx, - image_shapes=image_shapes, ) ] @@ -545,7 +514,6 @@ class PI052TextTokenizerStep(ProcessorStep): target_indices: list[int], complementary: dict[str, Any], sample_idx: int | None = None, - image_shapes: dict[str, tuple[int, int]] | None = None, ) -> tuple[Tensor, Tensor, Tensor, Tensor, str]: # Optional: drop non-target messages per the dropout config. # Keeps the supervised-target indices stable by re-mapping @@ -564,9 +532,9 @@ class PI052TextTokenizerStep(ProcessorStep): ) # Rewrite bbox / keypoint VQA target answers from JSON to - # PaliGemma text — done before stripping so the image - # block (camera frame) is still available to normalize against. - messages = _messages_vqa_to_loc(messages, target_indices, image_shapes) + # PaliGemma text. Coords are 0–1000 normalized so this is + # camera-independent. + messages = _messages_vqa_to_loc(messages, target_indices) # Flatten ``say`` tool calls into ``...`` text before # stripping, so the spoken reply is actually tokenized and diff --git a/tests/policies/pi052/test_pi052_vqa_loc.py b/tests/policies/pi052/test_pi052_vqa_loc.py index bb96ca1ab..a51452a08 100644 --- a/tests/policies/pi052/test_pi052_vqa_loc.py +++ b/tests/policies/pi052/test_pi052_vqa_loc.py @@ -19,8 +19,13 @@ PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in PaliGemma's native ```` detection vocabulary so the LM head reuses the detection prior instead of fighting it (the ````-salad -bug). The dataset stays backbone-agnostic JSON; the conversion lives in -PI052's tokenizer. These tests pin the JSON → ```` rewrite. +bug). The dataset stores Qwen2.5-VL's grounding output — **0–1000 +normalized** coordinates, *not* pixels. (Verified empirically on the +published datasets: x and y both span 0..1000 with ~30% of values +exceeding the camera's pixel dimensions.) The conversion is therefore +camera-resolution-independent. The dataset stays backbone-agnostic +JSON; the conversion lives in PI052's tokenizer. These tests pin the +JSON → ```` rewrite. """ import pytest @@ -28,80 +33,49 @@ import pytest pytest.importorskip("transformers") from lerobot.policies.pi052.text_processor_pi052 import ( # noqa: E402 - _camera_image_shapes, _loc_token, _messages_vqa_to_loc, _vqa_answer_to_loc, ) -class _FakeTensor: - def __init__(self, shape): - self.shape = shape - - -def test_camera_image_shapes_extracts_hw_from_image_keys(): - obs = { - "observation.images.top": _FakeTensor((1, 3, 240, 320)), - "observation.images.wrist": _FakeTensor((3, 480, 640)), - "observation.state": _FakeTensor((1, 7)), - "task": "x", - } - assert _camera_image_shapes(obs) == { - "observation.images.top": (240, 320), - "observation.images.wrist": (480, 640), - } - - -def test_camera_image_shapes_handles_empty(): - assert _camera_image_shapes({}) == {} - assert _camera_image_shapes(None) == {} - - def test_loc_token_normalizes_and_clamps(): - assert _loc_token(0, 100) == "" - assert _loc_token(100, 100) == "" - assert _loc_token(50, 100) == f"" + # Default scale is the 0–1000 Qwen convention. + assert _loc_token(0) == "" + assert _loc_token(1000) == "" + assert _loc_token(500) == f"" # out-of-range coordinates clamp into [0, 1023] - assert _loc_token(999, 100) == "" - assert _loc_token(-5, 100) == "" + assert _loc_token(9999) == "" + assert _loc_token(-5) == "" -def test_vqa_answer_to_loc_keypoint(): - answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]} - # height=240, width=320 → y=120/240=0.5, x=160/320=0.5 - out = _vqa_answer_to_loc(answer, height=240, width=320) - assert out == " blue cube" +def test_vqa_answer_to_loc_keypoint_normalized(): + # Qwen 0–1000 normalized coordinates → camera-independent . + answer = {"label": "blue cube", "point_format": "xy", "point": [500, 500]} + assert _vqa_answer_to_loc(answer) == " blue cube" -def test_vqa_answer_to_loc_bbox(): +def test_vqa_answer_to_loc_bbox_normalized(): answer = { - "detections": [ - {"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 320, 240]}, - ] + "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 1000, 1000]}] } - out = _vqa_answer_to_loc(answer, height=240, width=320) - assert out == " cube" + assert _vqa_answer_to_loc(answer) == " cube" def test_vqa_answer_to_loc_returns_none_for_non_spatial(): - assert _vqa_answer_to_loc({"label": "cubes", "count": 2}, 240, 320) is None - assert _vqa_answer_to_loc({"weird": "payload"}, 240, 320) is None + assert _vqa_answer_to_loc({"label": "cubes", "count": 2}) is None + assert _vqa_answer_to_loc({"weird": "payload"}) is None def test_messages_vqa_to_loc_rewrites_target_turn(): messages = [ + {"role": "user", "content": [{"type": "text", "text": "where is the cube?"}]}, { - "role": "user", - "content": [ - {"type": "image", "feature": "observation.images.top"}, - {"type": "text", "text": "where is the cube?"}, - ], + "role": "assistant", + "content": '{"label": "cube", "point_format": "xy", "point": [500, 500]}', }, - {"role": "assistant", "content": '{"label": "cube", "point_format": "xy", "point": [160, 120]}'}, ] - shapes = {"observation.images.top": (240, 320)} - out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes) + out = _messages_vqa_to_loc(messages, target_indices=[1]) assert out[1]["content"] == " cube" # input messages are not mutated assert messages[1]["content"].startswith("{") @@ -109,50 +83,51 @@ def test_messages_vqa_to_loc_rewrites_target_turn(): def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched(): messages = [ - {"role": "user", "content": [{"type": "image", "feature": "observation.images.top"}]}, + {"role": "user", "content": "pick the cube"}, {"role": "assistant", "content": "pick up the cube"}, ] - shapes = {"observation.images.top": (240, 320)} - out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes) + out = _messages_vqa_to_loc(messages, target_indices=[1]) assert out[1]["content"] == "pick up the cube" -def test_messages_vqa_to_loc_noop_without_shapes(): - messages = [{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}] - assert _messages_vqa_to_loc(messages, [0], None) is messages - assert _messages_vqa_to_loc(messages, [0], {}) is messages +def test_messages_vqa_to_loc_noop_without_target_indices(): + messages = [ + {"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'} + ] + assert _messages_vqa_to_loc(messages, []) is messages # --------------------------------------------------------------------------- -# Round-trip: training-side JSON -> -> runtime-side parse back to pixels +# Round-trip: training-side JSON -> -> runtime-side parse back # # Pins that the conversion preserves coordinate *order* (JSON is x-first, -# PaliGemma is y-first) and per-axis normalization. The only loss is -# quantization to the 1024-bucket grid, so a pixel survives within -# half a bucket (~W/2046, H/2046). +# PaliGemma is y-first) and the 0–1000 → [0, 1023] scaling. The +# only loss is quantization to the 1024-bucket grid, so a coord +# survives within half a bucket (~1000/2046 ≈ 0.49 on the 0–1000 scale). # --------------------------------------------------------------------------- -def test_loc_round_trip_keypoint_preserves_pixels(): +def test_loc_round_trip_keypoint_preserves_normalized_coords(): from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer - h, w = 240, 320 - answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]} - loc = _vqa_answer_to_loc(answer, h, w) + answer = {"label": "blue cube", "point_format": "xy", "point": [640, 480]} + loc = _vqa_answer_to_loc(answer) parsed = parse_vqa_answer(loc) nx, ny = parsed["payload"]["point"] - assert abs(nx * w - 160) <= w / 2046 + 1e-6 - assert abs(ny * h - 120) <= h / 2046 + 1e-6 + # parse_vqa_answer returns [0, 1] normalized; rescale back to 0–1000. + assert abs(nx * 1000.0 - 640) <= 1000.0 / 2046 + 1e-6 + assert abs(ny * 1000.0 - 480) <= 1000.0 / 2046 + 1e-6 assert parsed["payload"]["label"] == "blue cube" -def test_loc_round_trip_bbox_preserves_pixels_and_order(): +def test_loc_round_trip_bbox_preserves_order_and_scale(): from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer - h, w = 240, 320 - answer = {"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [32, 24, 288, 216]}]} - loc = _vqa_answer_to_loc(answer, h, w) + answer = { + "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [100, 200, 800, 900]}] + } + loc = _vqa_answer_to_loc(answer) parsed = parse_vqa_answer(loc) x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"] - for got, want, dim in ((x1, 32, w), (y1, 24, h), (x2, 288, w), (y2, 216, h)): - assert abs(got * dim - want) <= dim / 2046 + 1e-6 + for got, want in ((x1, 100), (y1, 200), (x2, 800), (y2, 900)): + assert abs(got * 1000.0 - want) <= 1000.0 / 2046 + 1e-6