review: address CarolinePascal feedback

- name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name
- rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py
- drop the unused GeneralVqaModule._generate_one
- remove "PR 1" references from comments/docstrings
- frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys
- executor.py: read/write meta/info.json via load_info / write_info
- reader.py: load meta/tasks.parquet via io_utils.load_tasks
- make --push_to_hub a bool; push the annotated dataset back to --repo_id
- move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it
- clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count)
- hoist stdlib dynamic imports to module scope

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-11 03:52:02 +00:00 · 2026-05-18 12:03:25 +02:00
parent 965d42825f
commit fd18beb3a1
23 changed files with 383 additions and 412 deletions
@@ -15,85 +15,24 @@
 # limitations under the License.
 """Shared fixtures for annotation-pipeline tests.

-Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests
-can exercise real parquet reads and writes without needing a checked-in
-LFS dataset.
+The on-disk dataset builder lives with the other dataset factories in
+``tests/fixtures/dataset_factories.py`` (:func:`build_annotation_dataset`);
+these fixtures only wire it into pytest.
 """

 from __future__ import annotations

-import json
 from pathlib import Path

-import pyarrow as pa
-import pyarrow.parquet as pq
 import pytest

-
-def _make_episode_table(
-    episode_index: int,
-    num_frames: int,
-    *,
-    fps: int = 10,
-    task_index: int = 0,
-) -> pa.Table:
-    timestamps = [round(i / fps, 6) for i in range(num_frames)]
-    frame_indices = list(range(num_frames))
-    return pa.Table.from_pydict(
-        {
-            "episode_index": [episode_index] * num_frames,
-            "frame_index": frame_indices,
-            "timestamp": timestamps,
-            "task_index": [task_index] * num_frames,
-            "subtask_index": [0] * num_frames,  # legacy column the writer must drop
-        }
-    )
-
-
-def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
-    """Create a fixture dataset under ``root``.
-
-    ``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
-    Each episode goes into its own ``data/chunk-000/file-{ep:03d}.parquet``
-    so the writer's per-shard rewrite path is exercised.
-    """
-    data_dir = root / "data" / "chunk-000"
-    data_dir.mkdir(parents=True, exist_ok=True)
-    tasks = {}
-    for episode_index, num_frames, task_text in episode_specs:
-        task_index = len(tasks)
-        if task_text not in tasks.values():
-            tasks[task_index] = task_text
-        else:
-            task_index = next(k for k, v in tasks.items() if v == task_text)
-        table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)
-        path = data_dir / f"file-{episode_index:03d}.parquet"
-        pq.write_table(table, path)
-
-    meta_dir = root / "meta"
-    meta_dir.mkdir(parents=True, exist_ok=True)
-    tasks_table = pa.Table.from_pydict(
-        {
-            "task_index": list(tasks.keys()),
-            "task": list(tasks.values()),
-        }
-    )
-    pq.write_table(tasks_table, meta_dir / "tasks.parquet")
-
-    info = {
-        "codebase_version": "v3.1",
-        "fps": fps,
-        "total_episodes": len(episode_specs),
-    }
-    (meta_dir / "info.json").write_text(json.dumps(info, indent=2))
-
-    return root
+from tests.fixtures.dataset_factories import build_annotation_dataset


@pytest.fixture
 def fixture_dataset_root(tmp_path: Path) -> Path:
    """A tiny dataset with two episodes, 12 frames each at 10 fps."""
-    return _build_dataset(
+    return build_annotation_dataset(
        tmp_path / "ds",
        episode_specs=[
            (0, 12, "Could you tidy the kitchen please?"),
@@ -105,7 +44,7 @@ def fixture_dataset_root(tmp_path: Path) -> Path:

@pytest.fixture
 def single_episode_root(tmp_path: Path) -> Path:
-    return _build_dataset(
+    return build_annotation_dataset(
        tmp_path / "ds_one",
        episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
        fps=10,