#!/usr/bin/env python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared fixtures for annotation-pipeline tests.

Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests can
exercise real Parquet reads and writes without needing a checked-in LFS
dataset.
"""

from __future__ import annotations

import json
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pytest


def _make_episode_table(
    episode_index: int,
    num_frames: int,
    *,
    fps: int = 10,
    task_index: int = 0,
) -> pa.Table:
    """Build one episode's frame table with evenly spaced timestamps."""
    timestamps = [round(i / fps, 6) for i in range(num_frames)]
    frame_indices = list(range(num_frames))
    return pa.Table.from_pydict(
        {
            "episode_index": [episode_index] * num_frames,
            "frame_index": frame_indices,
            "timestamp": timestamps,
            "task_index": [task_index] * num_frames,
            "subtask_index": [0] * num_frames,  # legacy column the writer must drop
        }
    )


def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
    """Create a fixture dataset under ``root``.

    ``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
    Each episode goes into its own ``data/chunk-000/file-{ep:03d}.parquet`` so
    the writer's per-shard rewrite path is exercised.
    """
    data_dir = root / "data" / "chunk-000"
    data_dir.mkdir(parents=True, exist_ok=True)

    tasks: dict[int, str] = {}
    for episode_index, num_frames, task_text in episode_specs:
        if task_text in tasks.values():
            # Reuse the index of an already-registered task text so
            # tasks.parquet stays deduplicated.
            task_index = next(k for k, v in tasks.items() if v == task_text)
        else:
            task_index = len(tasks)
            tasks[task_index] = task_text
        table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)
        path = data_dir / f"file-{episode_index:03d}.parquet"
        pq.write_table(table, path)

    meta_dir = root / "meta"
    meta_dir.mkdir(parents=True, exist_ok=True)
    tasks_table = pa.Table.from_pydict(
        {
            "task_index": list(tasks.keys()),
            "task": list(tasks.values()),
        }
    )
    pq.write_table(tasks_table, meta_dir / "tasks.parquet")

    info = {
        "codebase_version": "v3.1",
        "fps": fps,
        "total_episodes": len(episode_specs),
    }
    (meta_dir / "info.json").write_text(json.dumps(info, indent=2))

    return root


@pytest.fixture
def fixture_dataset_root(tmp_path: Path) -> Path:
    """A tiny dataset with two episodes, 12 frames each at 10 fps."""
    return _build_dataset(
        tmp_path / "ds",
        episode_specs=[
            (0, 12, "Could you tidy the kitchen please?"),
            (1, 12, "Please clean up the kitchen"),
        ],
        fps=10,
    )


@pytest.fixture
def single_episode_root(tmp_path: Path) -> Path:
    """A single 30-frame episode at 10 fps."""
    return _build_dataset(
        tmp_path / "ds_one",
        episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
        fps=10,
    )
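

# ---------------------------------------------------------------------------
# Illustrative sketch: what a consumer of these fixtures sees on disk. The
# helper below is hypothetical (nothing above calls it, and no test is
# required to); it only asserts properties that ``_build_dataset`` actually
# writes, so a test could use it as a quick layout sanity check.
# ---------------------------------------------------------------------------
def _smoke_check_layout(root: Path) -> None:
    """Hypothetical helper: sanity-check a dataset built by ``_build_dataset``."""
    # One shard per episode under data/chunk-000/.
    shard = root / "data" / "chunk-000" / "file-000.parquet"
    assert shard.exists()

    # Fixture episodes still carry the legacy ``subtask_index`` column that
    # the writer under test is expected to drop.
    table = pq.read_table(shard)
    assert "subtask_index" in table.column_names

    # Task texts are deduplicated in meta/tasks.parquet: one row per unique text.
    tasks = pq.read_table(root / "meta" / "tasks.parquet")
    assert tasks.num_rows == len(set(tasks.column("task").to_pylist()))

    # meta/info.json records the fps and episode count used to build the data.
    info = json.loads((root / "meta" / "info.json").read_text())
    assert info["total_episodes"] >= 1 and info["fps"] > 0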