review: address CarolinePascal feedback

- name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name
- rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py
- drop the unused GeneralVqaModule._generate_one
- remove "PR 1" references from comments/docstrings
- frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys
- executor.py: read/write meta/info.json via load_info / write_info
- reader.py: load meta/tasks.parquet via io_utils.load_tasks
- make --push_to_hub a bool; push the annotated dataset back to --repo_id
- move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it
- clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count)
- hoist stdlib dynamic imports to module scope

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-07 10:01:56 +00:00 · 2026-05-18 12:03:25 +02:00
parent 965d42825f
commit fd18beb3a1
23 changed files with 383 additions and 412 deletions
@@ -15,85 +15,24 @@
 # limitations under the License.
 """Shared fixtures for annotation-pipeline tests.

-Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests
-can exercise real parquet reads and writes without needing a checked-in
-LFS dataset.
+The on-disk dataset builder lives with the other dataset factories in
+``tests/fixtures/dataset_factories.py`` (:func:`build_annotation_dataset`);
+these fixtures only wire it into pytest.
 """

 from __future__ import annotations

-import json
 from pathlib import Path

-import pyarrow as pa
-import pyarrow.parquet as pq
 import pytest

-
-def _make_episode_table(
-    episode_index: int,
-    num_frames: int,
-    *,
-    fps: int = 10,
-    task_index: int = 0,
-) -> pa.Table:
-    timestamps = [round(i / fps, 6) for i in range(num_frames)]
-    frame_indices = list(range(num_frames))
-    return pa.Table.from_pydict(
-        {
-            "episode_index": [episode_index] * num_frames,
-            "frame_index": frame_indices,
-            "timestamp": timestamps,
-            "task_index": [task_index] * num_frames,
-            "subtask_index": [0] * num_frames,  # legacy column the writer must drop
-        }
-    )
-
-
-def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
-    """Create a fixture dataset under ``root``.
-
-    ``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
-    Each episode goes into its own ``data/chunk-000/file-{ep:03d}.parquet``
-    so the writer's per-shard rewrite path is exercised.
-    """
-    data_dir = root / "data" / "chunk-000"
-    data_dir.mkdir(parents=True, exist_ok=True)
-    tasks = {}
-    for episode_index, num_frames, task_text in episode_specs:
-        task_index = len(tasks)
-        if task_text not in tasks.values():
-            tasks[task_index] = task_text
-        else:
-            task_index = next(k for k, v in tasks.items() if v == task_text)
-        table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)
-        path = data_dir / f"file-{episode_index:03d}.parquet"
-        pq.write_table(table, path)
-
-    meta_dir = root / "meta"
-    meta_dir.mkdir(parents=True, exist_ok=True)
-    tasks_table = pa.Table.from_pydict(
-        {
-            "task_index": list(tasks.keys()),
-            "task": list(tasks.values()),
-        }
-    )
-    pq.write_table(tasks_table, meta_dir / "tasks.parquet")
-
-    info = {
-        "codebase_version": "v3.1",
-        "fps": fps,
-        "total_episodes": len(episode_specs),
-    }
-    (meta_dir / "info.json").write_text(json.dumps(info, indent=2))
-
-    return root
+from tests.fixtures.dataset_factories import build_annotation_dataset


@pytest.fixture
 def fixture_dataset_root(tmp_path: Path) -> Path:
    """A tiny dataset with two episodes, 12 frames each at 10 fps."""
-    return _build_dataset(
+    return build_annotation_dataset(
        tmp_path / "ds",
        episode_specs=[
            (0, 12, "Could you tidy the kitchen please?"),
@@ -105,7 +44,7 @@ def fixture_dataset_root(tmp_path: Path) -> Path:

@pytest.fixture
 def single_episode_root(tmp_path: Path) -> Path:
-    return _build_dataset(
+    return build_annotation_dataset(
        tmp_path / "ds_one",
        episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
        fps=10,
@@ -15,22 +15,19 @@
 # limitations under the License.
 """Opt-in E2E smoke run for ``make annotation-e2e``.

-Builds the same fixture used by the pytest suite, runs the full
-annotation pipeline against it with a stub VLM, and prints a short report.
-This is intentionally not a pytest test — it exercises the CLI plumbing
-without depending on conftest.py fixtures.
+Builds the shared annotation fixture (:func:`build_annotation_dataset`),
+runs the full annotation pipeline against it with a stub VLM, and prints a
+short report. This is intentionally not a pytest test — it exercises the
+CLI plumbing — but it reuses the same on-disk dataset builder as the pytest
+fixtures so there is no duplicated fixture code.
 """

 from __future__ import annotations

-import json
 import sys
 import tempfile
 from pathlib import Path

-import pyarrow as pa
-import pyarrow.parquet as pq
-
 from lerobot.annotations.steerable_pipeline.config import AnnotationPipelineConfig
 from lerobot.annotations.steerable_pipeline.executor import Executor
 from lerobot.annotations.steerable_pipeline.modules import (
@@ -41,31 +38,7 @@ from lerobot.annotations.steerable_pipeline.modules import (
 from lerobot.annotations.steerable_pipeline.validator import StagingValidator
 from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
 from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
-
-
-def _build_dataset(root: Path) -> Path:
-    data_dir = root / "data" / "chunk-000"
-    data_dir.mkdir(parents=True, exist_ok=True)
-    n = 30
-    timestamps = [round(i / 10, 6) for i in range(n)]
-    table = pa.Table.from_pydict(
-        {
-            "episode_index": [0] * n,
-            "frame_index": list(range(n)),
-            "timestamp": timestamps,
-            "task_index": [0] * n,
-            "subtask_index": [0] * n,
-        }
-    )
-    pq.write_table(table, data_dir / "file-000.parquet")
-    meta = root / "meta"
-    meta.mkdir(parents=True, exist_ok=True)
-    pq.write_table(
-        pa.Table.from_pydict({"task_index": [0], "task": ["Pour water into the cup."]}),
-        meta / "tasks.parquet",
-    )
-    (meta / "info.json").write_text(json.dumps({"codebase_version": "v3.1", "fps": 10}))
-    return root
+from tests.fixtures.dataset_factories import build_annotation_dataset


 def _stub_responder(messages):
@@ -102,14 +75,18 @@ def _stub_responder(messages):

 def main() -> int:
    with tempfile.TemporaryDirectory() as tmp:
-        root = _build_dataset(Path(tmp) / "ds")
+        root = build_annotation_dataset(
+            Path(tmp) / "ds",
+            episode_specs=[(0, 30, "Pour water into the cup.")],
+            fps=10,
+        )
        vlm = StubVlmClient(responder=_stub_responder)
        cfg = AnnotationPipelineConfig()
        executor = Executor(
            config=cfg,
-            module_1=PlanSubtasksMemoryModule(vlm=vlm, config=cfg.module_1),
-            module_2=InterjectionsAndSpeechModule(vlm=vlm, config=cfg.module_2, seed=cfg.seed),
-            module_3=GeneralVqaModule(vlm=vlm, config=cfg.module_3, seed=cfg.seed),
+            plan=PlanSubtasksMemoryModule(vlm=vlm, config=cfg.plan),
+            interjections=InterjectionsAndSpeechModule(vlm=vlm, config=cfg.interjections, seed=cfg.seed),
+            vqa=GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed),
            writer=LanguageColumnsWriter(),
            validator=StagingValidator(),
        )
@@ -23,9 +23,9 @@ from pathlib import Path
 from typing import Any

 from lerobot.annotations.steerable_pipeline.config import (
-    Module1Config,
-    Module2Config,
-    Module3Config,
+    InterjectionsConfig,
+    PlanConfig,
+    VqaConfig,
 )
 from lerobot.annotations.steerable_pipeline.modules import (
    GeneralVqaModule,
@@ -84,11 +84,11 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
            "Update the memory": {"memory": "wiped the counter once"},
        },
    )
-    module = PlanSubtasksMemoryModule(vlm=vlm, config=Module1Config())
+    module = PlanSubtasksMemoryModule(vlm=vlm, config=PlanConfig())
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)
-    rows = staging.read("module_1")
+    rows = staging.read("plan")

    styles = {r["style"] for r in rows}
    assert {"subtask", "plan", "memory"}.issubset(styles)
@@ -108,12 +108,12 @@ def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: P
    )
    module = InterjectionsAndSpeechModule(
        vlm=vlm,
-        config=Module2Config(max_interjections_per_episode=0),
+        config=InterjectionsConfig(max_interjections_per_episode=0),
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)
-    rows = staging.read("module_2")
+    rows = staging.read("interjections")
    assert len(rows) == 1
    only = rows[0]
    assert only["role"] == "assistant"
@@ -151,7 +151,7 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
    )
    module = InterjectionsAndSpeechModule(
        vlm=vlm,
-        config=Module2Config(max_interjections_per_episode=1, interjection_min_t=0.2),
+        config=InterjectionsConfig(max_interjections_per_episode=1, interjection_min_t=0.2),
        seed=7,
    )
    record = next(iter_episodes(fixture_dataset_root))
@@ -161,7 +161,7 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
    # production executor guarantees Module 1 ran first).
    boundary_ts = float(record.frame_timestamps[len(record.frame_timestamps) // 2])
    staging.write(
-        "module_1",
+        "plan",
        [
            {
                "role": "assistant",
@@ -180,7 +180,7 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
        ],
    )
    module.run_episode(record, staging)
-    rows = staging.read("module_2")
+    rows = staging.read("interjections")

    interjections = [r for r in rows if r["style"] == "interjection"]
    speeches = [r for r in rows if r["style"] is None and r["role"] == "assistant"]
@@ -198,14 +198,14 @@ def test_module3_vqa_unique_per_frame_and_camera(single_episode_root: Path, tmp_
    vlm = make_canned_responder({"frame-grounded visual question": payload})
    module = GeneralVqaModule(
        vlm=vlm,
-        config=Module3Config(vqa_emission_hz=1.0, K=3),
+        config=VqaConfig(vqa_emission_hz=1.0, K=3),
        seed=1,
        frame_provider=_StubFrameProvider(cameras=("observation.images.top", "observation.images.wrist")),
    )
    record = next(iter_episodes(single_episode_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)
-    rows = staging.read("module_3")
+    rows = staging.read("vqa")
    # every vqa row must carry a camera tag and one of the configured cameras
    for r in rows:
        assert r["style"] == "vqa"
@@ -257,7 +257,7 @@ def test_module1_attaches_video_block_to_subtask_prompt(fixture_dataset_root: Pa
        # call is the subtask one — keeps the assertions below focused on
        # ``_generate_subtasks`` rather than fighting the order of unrelated
        # text-only Module-1 sub-prompts.
-        config=Module1Config(max_video_frames=5, frames_per_second=10.0, n_task_rephrasings=0),
+        config=PlanConfig(max_video_frames=5, frames_per_second=10.0, n_task_rephrasings=0),
        frame_provider=provider,
    )
    record = next(iter_episodes(fixture_dataset_root))
@@ -304,7 +304,7 @@ def test_module3_attaches_frame_image_block_to_prompt(single_episode_root: Path,
    provider = _StubFrameProvider()
    module = GeneralVqaModule(
        vlm=_spy_responder(captured, payload),
-        config=Module3Config(vqa_emission_hz=1.0, K=1),
+        config=VqaConfig(vqa_emission_hz=1.0, K=1),
        seed=0,
        frame_provider=provider,
    )
@@ -336,14 +336,14 @@ def test_module3_assistant_content_is_valid_json(single_episode_root: Path, tmp_
    vlm = make_canned_responder({"frame-grounded visual question": payload})
    module = GeneralVqaModule(
        vlm=vlm,
-        config=Module3Config(vqa_emission_hz=1.0, K=2),
+        config=VqaConfig(vqa_emission_hz=1.0, K=2),
        seed=2,
        frame_provider=_StubFrameProvider(),
    )
    record = next(iter_episodes(single_episode_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)
-    rows = staging.read("module_3")
+    rows = staging.read("vqa")
    for row in rows:
        if row["role"] == "assistant" and row["style"] == "vqa":
            decoded = json.loads(row["content"])
@@ -23,9 +23,9 @@ import pyarrow.parquet as pq

 from lerobot.annotations.steerable_pipeline.config import (
    AnnotationPipelineConfig,
-    Module1Config,
-    Module2Config,
-    Module3Config,
+    InterjectionsConfig,
+    PlanConfig,
+    VqaConfig,
 )
 from lerobot.annotations.steerable_pipeline.executor import Executor
 from lerobot.annotations.steerable_pipeline.modules import (
@@ -115,15 +115,15 @@ def _build_executor() -> Executor:
        },
    )
    config = AnnotationPipelineConfig(
-        module_1=Module1Config(),
-        module_2=Module2Config(max_interjections_per_episode=1, interjection_min_t=0.5),
-        module_3=Module3Config(vqa_emission_hz=1.0, K=2),
+        plan=PlanConfig(),
+        interjections=InterjectionsConfig(max_interjections_per_episode=1, interjection_min_t=0.5),
+        vqa=VqaConfig(vqa_emission_hz=1.0, K=2),
    )
    return Executor(
        config=config,
-        module_1=PlanSubtasksMemoryModule(vlm=vlm, config=config.module_1),
-        module_2=InterjectionsAndSpeechModule(vlm=vlm, config=config.module_2, seed=config.seed),
-        module_3=GeneralVqaModule(vlm=vlm, config=config.module_3, seed=config.seed),
+        plan=PlanSubtasksMemoryModule(vlm=vlm, config=config.plan),
+        interjections=InterjectionsAndSpeechModule(vlm=vlm, config=config.interjections, seed=config.seed),
+        vqa=GeneralVqaModule(vlm=vlm, config=config.vqa, seed=config.seed),
        writer=LanguageColumnsWriter(),
        validator=StagingValidator(),
    )
@@ -34,7 +34,7 @@ def _validate(root: Path, staging_dir: Path):
 def test_validator_catches_misaligned_timestamps(fixture_dataset_root: Path, tmp_path: Path) -> None:
    staging_dir = tmp_path / "stage"
    EpisodeStaging(staging_dir, 0).write(
-        "module_3",
+        "vqa",
        [
            {
                "role": "assistant",
@@ -53,7 +53,7 @@ def test_validator_catches_misaligned_timestamps(fixture_dataset_root: Path, tmp
 def test_validator_catches_orphan_speech(fixture_dataset_root: Path, tmp_path: Path) -> None:
    staging_dir = tmp_path / "stage"
    EpisodeStaging(staging_dir, 0).write(
-        "module_2",
+        "interjections",
        [
            speech_atom(0.0, "Got it."),
            # interjection at 0.3s with NO paired speech
@@ -74,7 +74,7 @@ def test_validator_catches_orphan_speech(fixture_dataset_root: Path, tmp_path: P
 def test_validator_catches_inconsistent_plan_memory(fixture_dataset_root: Path, tmp_path: Path) -> None:
    staging_dir = tmp_path / "stage"
    EpisodeStaging(staging_dir, 0).write(
-        "module_1",
+        "plan",
        [
            {
                "role": "assistant",
@@ -93,7 +93,7 @@ def test_validator_catches_inconsistent_plan_memory(fixture_dataset_root: Path,
        ],
    )
    EpisodeStaging(staging_dir, 0).write(
-        "module_2",
+        "interjections",
        [
            speech_atom(0.0, "Got it."),
            speech_atom(0.4, "Replanning."),
@@ -115,11 +115,11 @@ def test_validator_catches_inconsistent_plan_memory(fixture_dataset_root: Path,
 def test_validator_catches_wrong_column(fixture_dataset_root: Path, tmp_path: Path) -> None:
    staging_dir = tmp_path / "stage"
    EpisodeStaging(staging_dir, 0).write(
-        "module_1",
+        "plan",
        [
            {"role": "user", "content": "where?", "style": "vqa", "timestamp": 0.0, "tool_calls": None},
        ],
    )
    report = _validate(fixture_dataset_root, staging_dir)
    assert not report.ok
-    assert any("module_1 emitted style 'vqa'" in e or "must be persistent" in e for e in report.errors)
+    assert any("plan emitted style 'vqa'" in e or "must be persistent" in e for e in report.errors)
@@ -35,17 +35,17 @@ def _stage_episode(
    staging_dir: Path,
    episode_index: int,
    *,
-    module_1: list[dict] | None = None,
-    module_2: list[dict] | None = None,
-    module_3: list[dict] | None = None,
+    plan: list[dict] | None = None,
+    interjections: list[dict] | None = None,
+    vqa: list[dict] | None = None,
 ) -> None:
    staging = EpisodeStaging(staging_dir, episode_index)
-    if module_1 is not None:
-        staging.write("module_1", module_1)
-    if module_2 is not None:
-        staging.write("module_2", module_2)
-    if module_3 is not None:
-        staging.write("module_3", module_3)
+    if plan is not None:
+        staging.write("plan", plan)
+    if interjections is not None:
+        staging.write("interjections", interjections)
+    if vqa is not None:
+        staging.write("vqa", vqa)


 def test_writer_persistence_identity(fixture_dataset_root: Path, tmp_path: Path) -> None:
@@ -54,7 +54,7 @@ def test_writer_persistence_identity(fixture_dataset_root: Path, tmp_path: Path)
    _stage_episode(
        staging_dir,
        0,
-        module_1=[
+        plan=[
            {
                "role": "assistant",
                "content": "grasp the sponge",
@@ -94,7 +94,7 @@ def test_writer_events_exact_timestamp(fixture_dataset_root: Path, tmp_path: Pat
    _stage_episode(
        staging_dir,
        0,
-        module_2=[
+        interjections=[
            speech_atom(0.0, "Got it."),
            {
                "role": "user",
@@ -127,7 +127,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
    _stage_episode(
        staging_dir,
        0,
-        module_1=[
+        plan=[
            {
                "role": "assistant",
                "content": "do X",
@@ -150,7 +150,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
                "tool_calls": None,
            },
        ],
-        module_2=[
+        interjections=[
            speech_atom(0.0, "OK"),
            {
                "role": "user",
@@ -161,7 +161,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
            },
            speech_atom(0.2, "Waiting"),
        ],
-        module_3=[
+        vqa=[
            {
                "role": "user",
                "content": "where is the cup?",
@@ -201,7 +201,7 @@ def test_writer_drops_subtask_index_idempotent(fixture_dataset_root: Path, tmp_p
    _stage_episode(
        staging_dir,
        0,
-        module_1=[
+        plan=[
            {
                "role": "assistant",
                "content": "do X",
@@ -277,7 +277,7 @@ def test_writer_does_not_add_tools_column(fixture_dataset_root: Path, tmp_path:
    _stage_episode(
        staging_dir,
        0,
-        module_1=[
+        plan=[
            {"role": "assistant", "content": "x", "style": "subtask", "timestamp": 0.0, "tool_calls": None}
        ],
    )
@@ -316,7 +316,7 @@ def test_annotation_metadata_sync_allows_non_streaming_load(
    _stage_episode(
        staging_dir,
        0,
-        module_1=[
+        plan=[
            {"role": "assistant", "content": "do X", "style": "subtask", "timestamp": 0.0, "tool_calls": None}
        ],
    )