mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 02:59:50 +00:00
review: address CarolinePascal feedback
- name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name - rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py - drop the unused GeneralVqaModule._generate_one - remove "PR 1" references from comments/docstrings - frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys - executor.py: read/write meta/info.json via load_info / write_info - reader.py: load meta/tasks.parquet via io_utils.load_tasks - make --push_to_hub a bool; push the annotated dataset back to --repo_id - move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it - clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count) - hoist stdlib dynamic imports to module scope Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vendored
+61
@@ -555,3 +555,64 @@ def lerobot_dataset_factory(
|
||||
@pytest.fixture(scope="session")
|
||||
def empty_lerobot_dataset_factory() -> LeRobotDatasetFactory:
|
||||
return partial(LeRobotDataset.create, repo_id=DUMMY_REPO_ID, fps=DEFAULT_FPS)
|
||||
|
||||
|
||||
def build_annotation_dataset(
|
||||
root: Path,
|
||||
episode_specs: list[tuple[int, int, str]],
|
||||
*,
|
||||
fps: int = 10,
|
||||
) -> Path:
|
||||
"""Build a minimal LeRobot-shaped dataset on disk for annotation tests.
|
||||
|
||||
``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
|
||||
Each episode is written to its own
|
||||
``data/chunk-000/file-{ep:03d}.parquet`` so the writer's per-shard
|
||||
rewrite path is exercised. The dataset carries the minimum
|
||||
``meta/tasks.parquet`` + ``meta/info.json`` the reader / executor need;
|
||||
it has no videos, so the modules fall back to text-only prompts.
|
||||
|
||||
Shared by the annotation-pipeline pytest fixtures (``tests/annotations/
|
||||
conftest.py``) and the opt-in E2E smoke run so the fixture shape lives
|
||||
in exactly one place.
|
||||
"""
|
||||
from lerobot.datasets.io_utils import write_tasks
|
||||
from lerobot.utils.io_utils import write_json
|
||||
|
||||
data_dir = root / "data" / "chunk-000"
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
tasks: dict[int, str] = {}
|
||||
for episode_index, num_frames, task_text in episode_specs:
|
||||
if task_text not in tasks.values():
|
||||
tasks[len(tasks)] = task_text
|
||||
task_index = next(k for k, v in tasks.items() if v == task_text)
|
||||
frame = pd.DataFrame(
|
||||
{
|
||||
"episode_index": [episode_index] * num_frames,
|
||||
"frame_index": list(range(num_frames)),
|
||||
"timestamp": [round(i / fps, 6) for i in range(num_frames)],
|
||||
"task_index": [task_index] * num_frames,
|
||||
"subtask_index": [0] * num_frames, # legacy column the writer must drop
|
||||
}
|
||||
)
|
||||
frame.to_parquet(data_dir / f"file-{episode_index:03d}.parquet", index=False)
|
||||
|
||||
# Canonical tasks frame: indexed by task string with a ``task_index``
|
||||
# column, matching what ``lerobot.datasets.io_utils.load_tasks`` expects.
|
||||
tasks_df = pd.DataFrame(
|
||||
{"task_index": list(tasks.keys())},
|
||||
index=pd.Index(list(tasks.values()), name="task"),
|
||||
)
|
||||
write_tasks(tasks_df, root)
|
||||
|
||||
write_json(
|
||||
{
|
||||
"codebase_version": "v3.1",
|
||||
"fps": fps,
|
||||
"features": {},
|
||||
"total_episodes": len(episode_specs),
|
||||
},
|
||||
root / "meta" / "info.json",
|
||||
)
|
||||
return root
|
||||
|
||||
Reference in New Issue
Block a user