From 73740ecf4b0b6bddd4c07a2c9daa59d3db07b4bc Mon Sep 17 00:00:00 2001 From: Pepijn Date: Thu, 30 Apr 2026 18:51:38 +0200 Subject: [PATCH] feat(annotate): write tool catalog to meta/info.json after annotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After every ``lerobot-annotate`` run, the executor ensures ``meta/info.json["tools"]`` contains at minimum the canonical ``say`` schema, while preserving any tools the user pre-declared on the dataset. Chat-template consumers (PR 3 SmolVLA2 / Pi0.5 / dataset visualizer) read the catalog through ``LeRobotDatasetMetadata.tools`` and pass it to ``apply_chat_template(messages, tools=meta.tools, ...)``. - ``executor.py``: new ``_ensure_tools_in_info`` helper called after the parquet rewrite. Idempotent and additive — merges by ``function.name``, only writes back if the list changed. - ``writer.py``: drops the duplicated ``SAY_TOOL_SCHEMA`` / ``DEFAULT_TOOLS`` constants in favour of importing from ``lerobot.datasets.language`` (PR 1's single source of truth). Re-exported so existing imports keep working. - ``annotation_pipeline.mdx``: replace the "code constant only" note with a pointer to the new Tools doc and a description of the meta/info.json behaviour, including how to pre-declare custom tools before annotation runs. This is the storage half of the tools work; PR 3 ships the runnable implementations under ``src/lerobot/tools/`` (one file per tool, first up: ``say.py`` wired to Kyutai's pocket-tts). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/source/annotation_pipeline.mdx | 19 +++++--- .../steerable_pipeline/executor.py | 48 +++++++++++++++++++ .../annotations/steerable_pipeline/writer.py | 35 ++------------ 3 files changed, 66 insertions(+), 36 deletions(-) diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx index 5d5ea2ef3..a3233551a 100644 --- a/docs/source/annotation_pipeline.mdx +++ b/docs/source/annotation_pipeline.mdx @@ -21,12 +21,19 @@ rewrites the data shards in place: | `vqa` (user / assistant pair) | `language_events` | Module 3 | The writer drops the legacy `subtask_index` column. It does **not** add a -`tools` column to the parquet — the `say` tool's JSON schema is fixed and -lives as a code constant (`SAY_TOOL_SCHEMA` / `DEFAULT_TOOLS` in -`lerobot.annotations.steerable_pipeline.writer`), so the parquet stays -small and PR 2 doesn't extend PR 1's schema. Chat-template consumers -import the constant directly (e.g. -`apply_chat_template(messages, tools=DEFAULT_TOOLS)`). +`tools` column to the parquet — the tool catalog lives at +`meta/info.json["tools"]` instead (see [Tools](./tools)). After every +annotation run the pipeline ensures the canonical `say` schema is +present in that list, preserving any tools the user pre-declared. +Chat-template consumers read the catalog through +`LeRobotDatasetMetadata.tools` and pass it to +`apply_chat_template(messages, tools=meta.tools, ...)`. + +If you want to declare additional tools for a dataset before annotation +runs, edit `meta/info.json["tools"]` directly — the pipeline preserves +anything already there. Implementations of those tools live under +`src/lerobot/tools/`; one file per tool, registered via +`TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide. 
## How to run it locally or on SLURM diff --git a/src/lerobot/annotations/steerable_pipeline/executor.py b/src/lerobot/annotations/steerable_pipeline/executor.py index b24d698d6..79a7f1614 100644 --- a/src/lerobot/annotations/steerable_pipeline/executor.py +++ b/src/lerobot/annotations/steerable_pipeline/executor.py @@ -128,8 +128,56 @@ class Executor: print(f"[annotate] writing parquet shards into {root}/data/...", flush=True) written = self.writer.write_all(records, staging_dir, root) print(f"[annotate] wrote {len(written)} shard(s); pipeline complete", flush=True) + + # Persist the tool catalog to meta/info.json so chat-template + # consumers (PR 3 SmolVLA2 / Pi0.5 / dataset visualizer) can read + # it via ``LeRobotDatasetMetadata.tools`` (PR 1). Idempotent and + # additive: anything the user pre-populated is preserved; we only + # ensure the canonical ``say`` schema is present. + self._ensure_tools_in_info(root) + return PipelineRunSummary(phases=phases, written_paths=written, validation_report=report) + def _ensure_tools_in_info(self, root: Path) -> None: + """Write ``meta/info.json["tools"]`` if missing the canonical ``say``. + + Reads any user-declared tools already in ``info.json`` and merges + the canonical ``SAY_TOOL_SCHEMA`` into the list (deduped by + ``function.name``). Writes back to disk only if the list + changed. 
+ """ + import json # noqa: PLC0415 + + from lerobot.datasets.language import SAY_TOOL_SCHEMA # noqa: PLC0415 + + info_path = root / "meta" / "info.json" + if not info_path.exists(): + return + try: + info = json.loads(info_path.read_text()) + except Exception as exc: # noqa: BLE001 + print(f"[annotate] could not read {info_path}: {exc}", flush=True) + return + + existing = info.get("tools") + if not isinstance(existing, list): + existing = [] + names = { + (t.get("function") or {}).get("name") + for t in existing + if isinstance(t, dict) + } + merged = list(existing) + if SAY_TOOL_SCHEMA["function"]["name"] not in names: + merged.append(SAY_TOOL_SCHEMA) + if merged != existing: + info["tools"] = merged + info_path.write_text(json.dumps(info, indent=2)) + print( + f"[annotate] meta/info.json: tools={[t['function']['name'] for t in merged]}", + flush=True, + ) + def _run_module_phase( self, name: str, diff --git a/src/lerobot/annotations/steerable_pipeline/writer.py b/src/lerobot/annotations/steerable_pipeline/writer.py index e595161c6..85c5aff3f 100644 --- a/src/lerobot/annotations/steerable_pipeline/writer.py +++ b/src/lerobot/annotations/steerable_pipeline/writer.py @@ -69,36 +69,11 @@ from .staging import EpisodeStaging logger = logging.getLogger(__name__) -SAY_TOOL_SCHEMA: dict[str, Any] = { - "type": "function", - "function": { - "name": "say", - "description": "Speak a short utterance to the user via the TTS executor.", - "parameters": { - "type": "object", - "properties": { - "text": { - "type": "string", - "description": "The verbatim text to speak.", - } - }, - "required": ["text"], - }, - }, -} -"""Fixed JSON schema for the only tool the canonical recipe knows about. - -Kept here as a code constant rather than written as a parquet column so -the v3.1 schema (PR 1) doesn't need to grow a redundant broadcast field -that holds the same value on every row of every dataset. 
Downstream -chat-template consumers (Pi0.5 processor, lerobot-dataset-visualizer) -import this directly. If multi-tool-set support ever becomes real, the -right place is ``meta/info.json["tools"]`` — adding it later is -non-breaking; ripping out a parquet column already shipped is not. -""" - -DEFAULT_TOOLS: list[dict[str, Any]] = [SAY_TOOL_SCHEMA] -"""Convenience list for ``apply_chat_template(messages, tools=...)``.""" +# Tool schema constants moved to lerobot.datasets.language in PR 1 — single +# source of truth. Re-exported here so existing imports +# (``from lerobot.annotations.steerable_pipeline.writer import SAY_TOOL_SCHEMA``) +# keep working. +from lerobot.datasets.language import DEFAULT_TOOLS, SAY_TOOL_SCHEMA # noqa: F401, E402 def _row_persistent_sort_key(row: dict[str, Any]) -> tuple: