examples(port_datasets): SLURM+datatrove RoboCasa composite_seen build

Parallel variant of build_robocasa_composite_seen.py modeled after the existing slurm_port_shards.py / slurm_aggregate_shards.py pattern. Two-phase datatrove pipeline: * Phase 1 DOWNLOAD: tasks=16 (one per RoboCasa composite_seen task), each worker downloads its assigned tar via RoboCasa's own download_datasets helper. Network-bound, idempotent. * Phase 2 AGGREGATE: tasks=1, single worker calls aggregate_datasets over the 16 extracted directories. Submitted with depends=phase1 so SLURM only releases it once all 16 downloads succeed. Reuses the COMPOSITE_SEEN_TASKS list and per-task download/resolve helpers from the single-machine script via aliased imports — single source of truth for 'what does it mean to download a composite_seen task'. Local (--slurm 0) mode runs the two phases sequentially in-process for debugging on a workstation. Usage on SLURM: uv run python examples/port_datasets/slurm_build_robocasa_composite_seen.py \ --output-dir=/scratch/${USER}/robocasa_composite_seen \ --hub-repo-id=${HF_USER}/robocasa_composite_seen \ --logs-dir=/scratch/${USER}/logs/robocasa \ --partition=cpu --push-to-hub Prereq: uv sync --extra annotations (pulls datatrove) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 02:36:11 +00:00 · 2026-05-25 14:10:05 +02:00
parent 9c3d5ab7ce
commit a088c10c80
31 changed files with 666 additions and 2432 deletions
@@ -162,7 +162,7 @@ def test_messages_vqa_to_loc_noop_without_target_indices():


 def test_loc_round_trip_keypoint_preserves_normalized_coords():
-    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
+    from lerobot.policies.pi052.inference.vqa import parse_vqa_answer

    answer = {"label": "blue cube", "point_format": "xy", "point": [640, 480]}
    loc = _vqa_answer_to_loc(answer)
@@ -175,7 +175,7 @@ def test_loc_round_trip_keypoint_preserves_normalized_coords():


 def test_loc_round_trip_bbox_preserves_order_and_scale():
-    from lerobot.policies.smolvla2.inference.vqa import parse_vqa_answer
+    from lerobot.policies.pi052.inference.vqa import parse_vqa_answer

    answer = {
        "detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [100, 200, 800, 900]}]
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Attention-masking tests for the SmolVLA2 text head.
-
-Regression coverage for the text-CE collapse bug: ``embed_prefix`` flags
-every language token ``att=0``, which ``make_att_2d_masks`` turns into a
-single fully *bidirectional* block. Under that mask the text
-cross-entropy degenerates into a copy task — a supervised target token
-attends to the tokens it is trained to predict — and the model never
-learns causal generation, so ``select_message`` collapses at inference.
-
-``_mark_target_span_causal`` sets ``att=1`` on the supervised target
-language positions so each target token attends causally among the
-targets while staying bidirectional to images + the user prompt. These
-tests pin that behaviour.
-"""
-
-import pytest
-import torch
-
-# The smolvla2 modeling module imports transformers transitively.
-pytest.importorskip("transformers")
-
-from lerobot.policies.smolvla.modeling_smolvla import make_att_2d_masks  # noqa: E402
-from lerobot.policies.smolvla2.modeling_smolvla2 import (  # noqa: E402
-    _locate_lang_range,
-    _mark_target_span_causal,
-)
-
-# ---------------------------------------------------------------------------
-# A synthetic SmolVLA prefix layout: [images, prompt-lang, target-lang, state]
-#
-#   indices 0-1  : 2 image tokens          (att = 0)
-#   indices 2-4  : 3 user-prompt lang      (att = 0)
-#   indices 5-8  : 4 supervised target lang(att = 0 from embed_prefix)
-#   index   9    : 1 state token           (att = 1)
-#
-# ``text_labels`` covers the 7 language tokens; -100 on the prompt span,
-# real ids on the 4-token target span.
-# ---------------------------------------------------------------------------
-N_IMAGE = 2
-N_PROMPT = 3
-N_TARGET = 4
-LANG_START = N_IMAGE
-LANG_END = N_IMAGE + N_PROMPT + N_TARGET  # = state-token index
-PREFIX_LEN = LANG_END + 1
-
-
-def _embed_prefix_att_masks() -> torch.Tensor:
-    """Mimic ``embed_prefix``: images + lang all att=0, state att=1."""
-    att = torch.zeros(1, PREFIX_LEN, dtype=torch.bool)
-    att[0, LANG_END] = True  # the single state token
-    return att
-
-
-def _text_labels() -> torch.Tensor:
-    """-100 over the prompt span, real ids over the target span."""
-    labels = torch.full((1, N_PROMPT + N_TARGET), -100, dtype=torch.long)
-    labels[0, N_PROMPT:] = torch.arange(10, 10 + N_TARGET)
-    return labels
-
-
-def _attends(prefix_att_masks: torch.Tensor) -> torch.Tensor:
-    """2D boolean attendance matrix; ``[i, j]`` True ⇒ i attends to j."""
-    pad = torch.ones(1, PREFIX_LEN, dtype=torch.bool)
-    return make_att_2d_masks(pad, prefix_att_masks)[0]
-
-
-def test_locate_lang_range_anchors_on_state_token():
-    """``_locate_lang_range`` finds the lang span via the lone att=1 token."""
-    lang_start, lang_end = _locate_lang_range(
-        _embed_prefix_att_masks(), num_lang=N_PROMPT + N_TARGET
-    )
-    assert (lang_start, lang_end) == (LANG_START, LANG_END)
-
-
-def test_mark_sets_att_on_targets_only():
-    """Only the supervised target language positions flip to att=1."""
-    marked = _mark_target_span_causal(
-        _embed_prefix_att_masks(), _text_labels(), LANG_START, LANG_END
-    )
-    expected = [False] * PREFIX_LEN
-    for i in range(LANG_START + N_PROMPT, LANG_END):  # target span
-        expected[i] = True
-    expected[LANG_END] = True  # state token, untouched
-    assert marked[0].tolist() == expected
-
-
-def test_target_tokens_attend_causally_among_themselves():
-    """A target token must NOT attend to later targets, but must attend
-    to earlier ones — i.e. genuine causal next-token prediction."""
-    marked = _mark_target_span_causal(
-        _embed_prefix_att_masks(), _text_labels(), LANG_START, LANG_END
-    )
-    attends = _attends(marked)
-    tgt = range(LANG_START + N_PROMPT, LANG_END)
-    for i in tgt:
-        for j in tgt:
-            if j > i:
-                assert not attends[i, j], f"target {i} must not see future target {j}"
-            else:
-                assert attends[i, j], f"target {i} must see earlier/self target {j}"
-
-
-def test_target_tokens_attend_prompt_and_images_bidirectionally():
-    """Targets keep full visibility of images + the user prompt."""
-    marked = _mark_target_span_causal(
-        _embed_prefix_att_masks(), _text_labels(), LANG_START, LANG_END
-    )
-    attends = _attends(marked)
-    context = list(range(0, LANG_START + N_PROMPT))  # images + prompt
-    for i in range(LANG_START + N_PROMPT, LANG_END):
-        for j in context:
-            assert attends[i, j], f"target {i} must attend context {j}"
-
-
-def test_action_expert_token_still_sees_full_subtask():
-    """The state token (action-expert context) attends to every target —
-    causal masking the targets must not hide them from the action path."""
-    marked = _mark_target_span_causal(
-        _embed_prefix_att_masks(), _text_labels(), LANG_START, LANG_END
-    )
-    attends = _attends(marked)
-    for j in range(LANG_START + N_PROMPT, LANG_END):
-        assert attends[LANG_END, j], f"state token must see target {j}"
-
-
-def test_non_target_subtask_stays_bidirectional():
-    """``low_level_execution`` renders the subtask as a user turn — its
-    ``text_labels`` are all -100, so the mask must be left untouched and
-    the action expert reads the subtask bidirectionally."""
-    all_ignored = torch.full((1, N_PROMPT + N_TARGET), -100, dtype=torch.long)
-    marked = _mark_target_span_causal(
-        _embed_prefix_att_masks(), all_ignored, LANG_START, LANG_END
-    )
-    assert torch.equal(marked, _embed_prefix_att_masks())
-
-
-def test_unmarked_mask_is_bidirectional_the_bug():
-    """Documents the bug the fix prevents: without ``_mark_target_span_causal``
-    a target token attends *bidirectionally* to later targets — the
-    text-CE can copy the answer it is trained to predict."""
-    attends = _attends(_embed_prefix_att_masks())
-    first_tgt = LANG_START + N_PROMPT
-    last_tgt = LANG_END - 1
-    assert attends[first_tgt, last_tgt], (
-        "raw embed_prefix mask is bidirectional over language — the first "
-        "target token can see the last, which is the collapse bug"
-    )
@@ -1,77 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for SmolVLA2's chat-tokenizer ``tool_calls`` flattening.
-
-``_split_plan_and_say`` (inference) expects the model to emit a textual
-``<say>...</say>`` marker. ``_flatten_say_tool_calls`` is the training-time
-serializer that produces it: it rewrites an assistant turn's structured
-``say`` tool call into that marker *inside the content text*, before
-``apply_chat_template`` runs — so the chat template only tokenizes plain
-text and the supervised target span trains the model to emit the marker
-the runtime parses back. These tests pin the round-trip.
-"""
-
-from lerobot.policies.smolvla2.chat_processor_smolvla2 import flatten_say_tool_calls
-from lerobot.policies.smolvla2.inference.steps import _split_plan_and_say
-
-
-def _say_call(text):
-    return {"type": "function", "function": {"name": "say", "arguments": {"text": text}}}
-
-
-def test_flatten_appends_say_marker_and_drops_tool_calls():
-    msg = {"role": "assistant", "content": "Pick up the blue cube.", "tool_calls": [_say_call("On it!")]}
-    out = flatten_say_tool_calls(msg)
-    assert "tool_calls" not in out
-    assert out["content"] == "Pick up the blue cube.\n<say>On it!</say>"
-
-
-def test_flatten_roundtrips_through_inference_parser():
-    """The marker the serializer writes must be exactly what the inference
-    parser reads back — this is the train/inference contract."""
-    msg = {"role": "assistant", "content": "Move toward the cube.", "tool_calls": [_say_call("Working on it")]}
-    flat = flatten_say_tool_calls(msg)["content"]
-    plan, speech = _split_plan_and_say(flat)
-    assert plan == "Move toward the cube."
-    assert speech == "Working on it"
-
-
-def test_flatten_accepts_json_string_arguments():
-    """``arguments`` may arrive as a JSON string rather than a dict."""
-    call = {"type": "function", "function": {"name": "say", "arguments": '{"text": "hello there"}'}}
-    out = flatten_say_tool_calls({"role": "assistant", "content": "p", "tool_calls": [call]})
-    assert out["content"] == "p\n<say>hello there</say>"
-
-
-def test_flatten_leaves_messages_without_tool_calls_untouched():
-    msg = {"role": "assistant", "content": "just a plan"}
-    assert flatten_say_tool_calls(msg) == msg
-
-
-def test_flatten_drops_empty_or_non_say_tool_calls():
-    """A non-``say`` call (or empty text) leaves content alone but still
-    strips the structured calls so the template renders no JSON block."""
-    weather = {"type": "function", "function": {"name": "check_weather", "arguments": {}}}
-    out = flatten_say_tool_calls({"role": "assistant", "content": "plan only", "tool_calls": [weather]})
-    assert out["content"] == "plan only"
-    assert "tool_calls" not in out
-
-
-def test_flatten_marker_only_when_content_empty():
-    msg = {"role": "assistant", "content": "", "tool_calls": [_say_call("hi")]}
-    out = flatten_say_tool_calls(msg)
-    assert out["content"] == "<say>hi</say>"
@@ -1,228 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for the SmolVLA2 runtime's interactive-VQA helpers.
-
-Covers camera selection, VQA-answer parsing, and the bounding-box /
-keypoint overlay drawing — the pure functions, no model load.
-"""
-
-import numpy as np
-import pytest
-
-from lerobot.policies.smolvla2.inference.vqa import (
-    answer_has_overlay,
-    available_cameras,
-    camera_short_name,
-    draw_vqa_overlay,
-    observation_image_to_pil,
-    parse_vqa_answer,
-    prompt_camera_choice,
-)
-
-PIL = pytest.importorskip("PIL")
-from PIL import Image  # noqa: E402
-
-# ---------------------------------------------------------------------------
-# Camera selection
-# ---------------------------------------------------------------------------
-
-
-def test_available_cameras_extracts_and_sorts_image_keys():
-    observation = {
-        "observation.images.wrist": object(),
-        "observation.state": object(),
-        "observation.images.top": object(),
-        "task": "x",
-    }
-    assert available_cameras(observation) == [
-        "observation.images.top",
-        "observation.images.wrist",
-    ]
-
-
-def test_available_cameras_handles_none_and_empty():
-    assert available_cameras(None) == []
-    assert available_cameras({}) == []
-
-
-def test_camera_short_name_strips_prefix():
-    assert camera_short_name("observation.images.top") == "top"
-    assert camera_short_name("top") == "top"
-
-
-def test_prompt_camera_choice_single_camera_auto_selects():
-    cams = ["observation.images.top"]
-    # input_fn must never be called for a single-camera setup.
-    chosen = prompt_camera_choice(cams, input_fn=_boom, print_fn=lambda *_: None)
-    assert chosen == "observation.images.top"
-
-
-def test_prompt_camera_choice_by_number():
-    cams = ["observation.images.top", "observation.images.wrist"]
-    chosen = prompt_camera_choice(cams, input_fn=lambda _: "2", print_fn=lambda *_: None)
-    assert chosen == "observation.images.wrist"
-
-
-def test_prompt_camera_choice_by_name():
-    cams = ["observation.images.top", "observation.images.wrist"]
-    chosen = prompt_camera_choice(cams, input_fn=lambda _: "top", print_fn=lambda *_: None)
-    assert chosen == "observation.images.top"
-
-
-def test_prompt_camera_choice_invalid_returns_none():
-    cams = ["observation.images.top", "observation.images.wrist"]
-    assert prompt_camera_choice(cams, input_fn=lambda _: "99", print_fn=lambda *_: None) is None
-
-
-def _boom(*_args, **_kwargs):
-    raise AssertionError("input_fn should not be called")
-
-
-# ---------------------------------------------------------------------------
-# Answer parsing
-# ---------------------------------------------------------------------------
-
-
-def test_parse_bbox_answer():
-    answer = '{"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [10, 20, 50, 80]}]}'
-    parsed = parse_vqa_answer(answer)
-    assert parsed["kind"] == "bbox"
-    assert answer_has_overlay(parsed)
-
-
-def test_parse_keypoint_answer():
-    answer = '{"label": "blue cube", "point_format": "xy", "point": [120, 90]}'
-    parsed = parse_vqa_answer(answer)
-    assert parsed["kind"] == "keypoint"
-    assert answer_has_overlay(parsed)
-
-
-def test_parse_count_answer_is_not_an_overlay():
-    parsed = parse_vqa_answer('{"label": "cubes", "count": 2}')
-    assert parsed["kind"] == "count"
-    assert not answer_has_overlay(parsed)
-
-
-def test_parse_invalid_json_returns_none():
-    assert parse_vqa_answer("not json at all") is None
-    assert parse_vqa_answer("") is None
-    # A JSON array is valid JSON but not a VQA answer object.
-    assert parse_vqa_answer("[1, 2, 3]") is None
-
-
-def test_parse_unknown_shape():
-    parsed = parse_vqa_answer('{"weird": "payload"}')
-    assert parsed["kind"] == "unknown"
-    assert not answer_has_overlay(parsed)
-
-
-# ---------------------------------------------------------------------------
-# Overlay drawing
-# ---------------------------------------------------------------------------
-
-
-def _blank(size=(160, 120)):
-    return Image.new("RGB", size, (0, 0, 0))
-
-
-def test_draw_bbox_overlay_changes_pixels_and_preserves_size():
-    img = _blank()
-    parsed = parse_vqa_answer(
-        '{"detections": [{"label": "cube", "bbox_format": "xyxy", "bbox": [10, 20, 50, 80]}]}'
-    )
-    out = draw_vqa_overlay(img, parsed)
-    assert out.size == img.size
-    assert out.tobytes() != img.tobytes()
-
-
-def test_draw_keypoint_overlay_changes_pixels():
-    img = _blank()
-    parsed = parse_vqa_answer('{"label": "cube", "point_format": "xy", "point": [80, 60]}')
-    out = draw_vqa_overlay(img, parsed)
-    assert out.size == img.size
-    assert out.tobytes() != img.tobytes()
-
-
-def test_draw_overlay_non_spatial_leaves_image_unchanged():
-    img = _blank()
-    parsed = parse_vqa_answer('{"label": "cubes", "count": 2}')
-    out = draw_vqa_overlay(img, parsed)
-    assert out.tobytes() == img.tobytes()
-
-
-def test_draw_overlay_tolerates_malformed_coordinates():
-    img = _blank()
-    # bbox with the wrong arity must not raise.
-    out = draw_vqa_overlay(img, {"kind": "bbox", "payload": {"detections": [{"bbox": [1, 2]}]}})
-    assert out.size == img.size
-
-
-def test_observation_image_to_pil_from_batched_float_array():
-    # (1, C, H, W) float array in [0, 1], the runtime observation shape.
-    arr = np.zeros((1, 3, 24, 32), dtype=np.float32)
-    pil = observation_image_to_pil(arr)
-    assert pil.size == (32, 24)
-    assert pil.mode == "RGB"
-
-
-# ---------------------------------------------------------------------------
-# PaliGemma <loc>-format answers (PI052 trains spatial VQA in this vocab)
-# ---------------------------------------------------------------------------
-
-
-def test_parse_loc_keypoint_answer():
-    # <locY><locX> label — y=512/1023≈0.5, x=256/1023≈0.25
-    parsed = parse_vqa_answer("<loc0512><loc0256> blue cube")
-    assert parsed["kind"] == "keypoint"
-    assert parsed["normalized"] is True
-    x, y = parsed["payload"]["point"]
-    assert 0.24 < x < 0.26
-    assert 0.49 < y < 0.51
-    assert parsed["payload"]["label"] == "blue cube"
-    assert answer_has_overlay(parsed)
-
-
-def test_parse_loc_bbox_answer():
-    # <locY0><locX0><locY1><locX1> label
-    parsed = parse_vqa_answer("<loc0100><loc0080><loc0400><loc0360> yellow cube")
-    assert parsed["kind"] == "bbox"
-    assert parsed["normalized"] is True
-    det = parsed["payload"]["detections"][0]
-    x1, y1, x2, y2 = det["bbox"]
-    assert x1 < x2 and y1 < y2
-    assert det["label"] == "yellow cube"
-    assert answer_has_overlay(parsed)
-
-
-def test_parse_loc_multiple_boxes():
-    answer = "<loc0100><loc0080><loc0400><loc0360> cube ; <loc0200><loc0500><loc0600><loc0900> box"
-    parsed = parse_vqa_answer(answer)
-    assert parsed["kind"] == "bbox"
-    assert len(parsed["payload"]["detections"]) == 2
-
-
-def test_parse_loc_takes_precedence_over_json():
-    # An answer with <loc> tokens is parsed as loc even if JSON-ish.
-    assert parse_vqa_answer('{"x": <loc0001><loc0002>}')["normalized"] is True
-
-
-def test_draw_loc_overlay_denormalizes_to_pixels():
-    img = _blank((200, 100))
-    parsed = parse_vqa_answer("<loc0511><loc0511> cube")  # ~centre
-    out = draw_vqa_overlay(img, parsed)
-    assert out.size == img.size
-    assert out.tobytes() != img.tobytes()