feat: language annotation pipeline (PR 2/3)

Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-*/file-*.parquet`. No flavor namespace, no sidecar tree.

Modules produced:
- Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries.
- Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom.
- Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation.

Writer enforces per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, and a dataset-level `tools` column with the `say` schema; it also drops the legacy `subtask_index` column. Validator runs against staged JSONL artifacts before the writer rewrites parquet.

Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`.

Branched from PR 1 (`feat/language-columns`).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 17:20:05 +00:00 · 2026-04-27 16:22:51 +02:00
parent 1ca38d9748
commit 785cee429e
33 changed files with 3409 additions and 0 deletions
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared fixtures for annotation-pipeline tests.
+
+Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests
+can exercise real parquet reads and writes without needing a checked-in
+LFS dataset.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pytest
+
+
+def _make_episode_table(
+    episode_index: int,
+    num_frames: int,
+    *,
+    fps: int = 10,
+    task_index: int = 0,
+) -> pa.Table:
+    timestamps = [round(i / fps, 6) for i in range(num_frames)]
+    frame_indices = list(range(num_frames))
+    return pa.Table.from_pydict(
+        {
+            "episode_index": [episode_index] * num_frames,
+            "frame_index": frame_indices,
+            "timestamp": timestamps,
+            "task_index": [task_index] * num_frames,
+            "subtask_index": [0] * num_frames,  # legacy column the writer must drop
+        }
+    )
+
+
+def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
+    """Create a fixture dataset under ``root``.
+
+    ``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
+    Each episode goes into its own ``data/chunk-000/file-{ep:03d}.parquet``
+    so the writer's per-shard rewrite path is exercised.
+    """
+    data_dir = root / "data" / "chunk-000"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    tasks = {}
+    for episode_index, num_frames, task_text in episode_specs:
+        task_index = len(tasks)
+        if task_text not in tasks.values():
+            tasks[task_index] = task_text
+        else:
+            task_index = next(k for k, v in tasks.items() if v == task_text)
+        table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)
+        path = data_dir / f"file-{episode_index:03d}.parquet"
+        pq.write_table(table, path)
+
+    meta_dir = root / "meta"
+    meta_dir.mkdir(parents=True, exist_ok=True)
+    tasks_table = pa.Table.from_pydict(
+        {
+            "task_index": list(tasks.keys()),
+            "task": list(tasks.values()),
+        }
+    )
+    pq.write_table(tasks_table, meta_dir / "tasks.parquet")
+
+    info = {
+        "codebase_version": "v3.1",
+        "fps": fps,
+        "total_episodes": len(episode_specs),
+    }
+    (meta_dir / "info.json").write_text(json.dumps(info, indent=2))
+
+    return root
+
+
+@pytest.fixture
+def fixture_dataset_root(tmp_path: Path) -> Path:
+    """A tiny dataset with two episodes, 12 frames each at 10 fps."""
+    return _build_dataset(
+        tmp_path / "ds",
+        episode_specs=[
+            (0, 12, "Could you tidy the kitchen please?"),
+            (1, 12, "Please clean up the kitchen"),
+        ],
+        fps=10,
+    )
+
+
+@pytest.fixture
+def single_episode_root(tmp_path: Path) -> Path:
+    return _build_dataset(
+        tmp_path / "ds_one",
+        episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
+        fps=10,
+    )