lerobot/tests/policies/pi052/test_pi052_vqa_loc.py

#!/usr/bin/env python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Training-side conversion of VQA answers to PaliGemma ``<loc>`` text.

PI052 trains spatial VQA answers (``bbox`` / ``keypoint``) in
PaliGemma's native ``<locNNNN>`` detection vocabulary so the LM head
reuses the detection prior instead of fighting it (the ``<loc>``-salad
bug). The dataset stays backbone-agnostic JSON; the conversion lives in
PI052's tokenizer. These tests pin the JSON → ``<loc>`` rewrite.
"""

import pytest

pytest.importorskip("transformers")

from lerobot.policies.pi052.text_processor_pi052 import (  # noqa: E402
    _camera_image_shapes,
    _loc_token,
    _messages_vqa_to_loc,
    _vqa_answer_to_loc,
)


class _FakeTensor:
    def __init__(self, shape):
        self.shape = shape


def test_camera_image_shapes_extracts_hw_from_image_keys():
    obs = {
        "observation.images.top": _FakeTensor((1, 3, 240, 320)),
        "observation.images.wrist": _FakeTensor((3, 480, 640)),
        "observation.state": _FakeTensor((1, 7)),
        "task": "x",
    }
    assert _camera_image_shapes(obs) == {
        "observation.images.top": (240, 320),
        "observation.images.wrist": (480, 640),
    }


def test_camera_image_shapes_handles_empty():
    assert _camera_image_shapes({}) == {}
    assert _camera_image_shapes(None) == {}


def test_loc_token_normalizes_and_clamps():
    assert _loc_token(0, 100) == "<loc0000>"
    assert _loc_token(100, 100) == "<loc1023>"
    assert _loc_token(50, 100) == f"<loc{round(50 / 100 * 1023):04d}>"
    # out-of-range coordinates clamp into [0, 1023]
    assert _loc_token(999, 100) == "<loc1023>"
    assert _loc_token(-5, 100) == "<loc0000>"


def test_vqa_answer_to_loc_keypoint():
    answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
    # height=240, width=320 → y=120/240=0.5, x=160/320=0.5
    out = _vqa_answer_to_loc(answer, height=240, width=320)
    assert out == "<loc0512><loc0512> blue cube"


def test_vqa_answer_to_loc_bbox():
    answer = {
        "detections": [
            {"label": "cube", "bbox_format": "xyxy", "bbox": [0, 0, 320, 240]},
        ]
    }
    out = _vqa_answer_to_loc(answer, height=240, width=320)
    assert out == "<loc0000><loc0000><loc1023><loc1023> cube"


def test_vqa_answer_to_loc_returns_none_for_non_spatial():
    assert _vqa_answer_to_loc({"label": "cubes", "count": 2}, 240, 320) is None
    assert _vqa_answer_to_loc({"weird": "payload"}, 240, 320) is None


def test_messages_vqa_to_loc_rewrites_target_turn():
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "feature": "observation.images.top"},
                {"type": "text", "text": "where is the cube?"},
            ],
        },
        {"role": "assistant", "content": '{"label": "cube", "point_format": "xy", "point": [160, 120]}'},
    ]
    shapes = {"observation.images.top": (240, 320)}
    out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
    assert out[1]["content"] == "<loc0512><loc0512> cube"
    # input messages are not mutated
    assert messages[1]["content"].startswith("{")


def test_messages_vqa_to_loc_leaves_plain_text_targets_untouched():
    messages = [
        {"role": "user", "content": [{"type": "image", "feature": "observation.images.top"}]},
        {"role": "assistant", "content": "pick up the cube"},
    ]
    shapes = {"observation.images.top": (240, 320)}
    out = _messages_vqa_to_loc(messages, target_indices=[1], image_shapes=shapes)
    assert out[1]["content"] == "pick up the cube"


def test_messages_vqa_to_loc_noop_without_shapes():
    messages = [{"role": "assistant", "content": '{"label": "c", "point_format": "xy", "point": [1, 2]}'}]
    assert _messages_vqa_to_loc(messages, [0], None) is messages
    assert _messages_vqa_to_loc(messages, [0], {}) is messages


# ---------------------------------------------------------------------------
# Round-trip: training-side JSON -> <loc> -> runtime-side parse back to pixels
#
# Pins that the conversion preserves coordinate *order* (JSON is x-first,
# PaliGemma <loc> is y-first) and per-axis normalization. The only loss is
# quantization to the 1024-bucket <loc> grid, so a pixel survives within
# half a bucket (~W/2046, H/2046).
# ---------------------------------------------------------------------------


def test_loc_round_trip_keypoint_preserves_pixels():
    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer

    h, w = 240, 320
    answer = {"label": "blue cube", "point_format": "xy", "point": [160, 120]}
    loc = _vqa_answer_to_loc(answer, h, w)
    parsed = parse_vqa_answer(loc)
    nx, ny = parsed["payload"]["point"]
    assert abs(nx * w - 160) <= w / 2046 + 1e-6
    assert abs(ny * h - 120) <= h / 2046 + 1e-6
    assert parsed["payload"]["label"] == "blue cube"


def test_loc_round_trip_bbox_preserves_pixels_and_order():
    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer

    h, w = 240, 320
    answer = {"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [32, 24, 288, 216]}]}
    loc = _vqa_answer_to_loc(answer, h, w)
    parsed = parse_vqa_answer(loc)
    x1, y1, x2, y2 = parsed["payload"]["detections"][0]["bbox"]
    for got, want, dim in ((x1, 32, w), (y1, 24, h), (x2, 288, w), (y2, 216, h)):
        assert abs(got * dim - want) <= dim / 2046 + 1e-6