mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-19 10:40:04 +00:00
review: address CarolinePascal feedback
- name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name
- rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py
- drop the unused GeneralVqaModule._generate_one
- remove "PR 1" references from comments/docstrings
- frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys
- executor.py: read/write meta/info.json via load_info / write_info
- reader.py: load meta/tasks.parquet via io_utils.load_tasks
- make --push_to_hub a bool; push the annotated dataset back to --repo_id
- move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it
- clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count)
- hoist stdlib dynamic imports to module scope

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,85 +15,24 @@
|
||||
# limitations under the License.
|
||||
"""Shared fixtures for annotation-pipeline tests.
|
||||
|
||||
Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests
|
||||
can exercise real parquet reads and writes without needing a checked-in
|
||||
LFS dataset.
|
||||
The on-disk dataset builder lives with the other dataset factories in
|
||||
``tests/fixtures/dataset_factories.py`` (:func:`build_annotation_dataset`);
|
||||
these fixtures only wire it into pytest.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import pytest
|
||||
|
||||
|
||||
def _make_episode_table(
    episode_index: int,
    num_frames: int,
    *,
    fps: int = 10,
    task_index: int = 0,
) -> pa.Table:
    """Build the per-frame table for a single synthetic episode.

    The table has ``num_frames`` rows. ``frame_index`` counts up from 0,
    ``timestamp`` is ``frame_index / fps`` rounded to 6 decimals, and
    ``episode_index`` / ``task_index`` repeat the given values on every row.
    """
    frames = range(num_frames)
    columns = {
        "episode_index": [episode_index for _ in frames],
        "frame_index": [*frames],
        "timestamp": [round(frame / fps, 6) for frame in frames],
        "task_index": [task_index for _ in frames],
        # legacy column the writer must drop
        "subtask_index": [0 for _ in frames],
    }
    return pa.Table.from_pydict(columns)
|
||||
|
||||
|
||||
def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
    """Write a small fixture dataset under ``root`` and return ``root``.

    ``episode_specs`` holds ``(episode_index, num_frames, task_text)`` tuples.
    Every episode is written to its own
    ``data/chunk-000/file-{ep:03d}.parquet`` shard so the writer's per-shard
    rewrite path is exercised. ``meta/tasks.parquet`` and ``meta/info.json``
    are written alongside.
    """
    shard_dir = root / "data" / "chunk-000"
    shard_dir.mkdir(parents=True, exist_ok=True)

    # task text -> task index; a new text gets the next free index, a repeated
    # text reuses the index it was first given.
    index_by_task: dict[str, int] = {}
    for ep_idx, n_frames, text in episode_specs:
        idx = index_by_task.setdefault(text, len(index_by_task))
        pq.write_table(
            _make_episode_table(ep_idx, n_frames, fps=fps, task_index=idx),
            shard_dir / f"file-{ep_idx:03d}.parquet",
        )

    meta_dir = root / "meta"
    meta_dir.mkdir(parents=True, exist_ok=True)

    task_columns = {
        "task_index": list(index_by_task.values()),
        "task": list(index_by_task),
    }
    pq.write_table(pa.Table.from_pydict(task_columns), meta_dir / "tasks.parquet")

    info_payload = {
        "codebase_version": "v3.1",
        "fps": fps,
        "total_episodes": len(episode_specs),
    }
    (meta_dir / "info.json").write_text(json.dumps(info_payload, indent=2))

    return root
|
||||
from tests.fixtures.dataset_factories import build_annotation_dataset
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fixture_dataset_root(tmp_path: Path) -> Path:
|
||||
"""A tiny dataset with two episodes, 12 frames each at 10 fps."""
|
||||
return _build_dataset(
|
||||
return build_annotation_dataset(
|
||||
tmp_path / "ds",
|
||||
episode_specs=[
|
||||
(0, 12, "Could you tidy the kitchen please?"),
|
||||
@@ -105,7 +44,7 @@ def fixture_dataset_root(tmp_path: Path) -> Path:
|
||||
|
||||
@pytest.fixture
|
||||
def single_episode_root(tmp_path: Path) -> Path:
|
||||
return _build_dataset(
|
||||
return build_annotation_dataset(
|
||||
tmp_path / "ds_one",
|
||||
episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
|
||||
fps=10,
|
||||
|
||||
Reference in New Issue
Block a user