From 73740ecf4b0b6bddd4c07a2c9daa59d3db07b4bc Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 30 Apr 2026 18:51:38 +0200
Subject: [PATCH] feat(annotate): write tool catalog to meta/info.json after
 annotation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After every ``lerobot-annotate`` run, the executor ensures
``meta/info.json["tools"]`` contains at minimum the canonical ``say``
schema, while preserving any tools the user pre-declared on the
dataset. Chat-template consumers (PR 3 SmolVLA2 / Pi0.5 / dataset
visualizer) read the catalog through
``LeRobotDatasetMetadata.tools`` and pass it to
``apply_chat_template(messages, tools=meta.tools, ...)``.

- ``executor.py``: new ``_ensure_tools_in_info`` helper called
  after the parquet rewrite. Idempotent and additive — merges by
  ``function.name``, only writes back if the list changed.
- ``writer.py``: drops the duplicated ``SAY_TOOL_SCHEMA`` /
  ``DEFAULT_TOOLS`` constants in favour of importing from
  ``lerobot.datasets.language`` (PR 1's single source of truth).
  Re-exported so existing imports keep working.
- ``annotation_pipeline.mdx``: replaces the "code constant only" note
  with a pointer to the new Tools doc and a description of the
  meta/info.json behaviour, including how to pre-declare custom
  tools before annotation runs.

This is the storage half of the tools work; PR 3 ships the runnable
implementations under ``src/lerobot/tools/`` (one file per tool,
first up: ``say.py`` wired to Kyutai's pocket-tts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           | 19 +++++---
 .../steerable_pipeline/executor.py            | 48 +++++++++++++++++++
 .../annotations/steerable_pipeline/writer.py  | 35 ++------------
 3 files changed, 66 insertions(+), 36 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 5d5ea2ef3..a3233551a 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -21,12 +21,19 @@ rewrites the data shards in place:
 | `vqa` (user / assistant pair)               | `language_events`     | Module 3 |
 
 The writer drops the legacy `subtask_index` column. It does **not** add a
-`tools` column to the parquet — the `say` tool's JSON schema is fixed and
-lives as a code constant (`SAY_TOOL_SCHEMA` / `DEFAULT_TOOLS` in
-`lerobot.annotations.steerable_pipeline.writer`), so the parquet stays
-small and PR 2 doesn't extend PR 1's schema. Chat-template consumers
-import the constant directly (e.g.
-`apply_chat_template(messages, tools=DEFAULT_TOOLS)`).
+`tools` column to the parquet — the tool catalog lives at
+`meta/info.json["tools"]` instead (see [Tools](./tools)). After every
+annotation run the pipeline ensures the canonical `say` schema is
+present in that list, preserving any tools the user pre-declared.
+Chat-template consumers read the catalog through
+`LeRobotDatasetMetadata.tools` and pass it to
+`apply_chat_template(messages, tools=meta.tools, ...)`.
+
+If you want to declare additional tools for a dataset before annotation
+runs, edit `meta/info.json["tools"]` directly — the pipeline preserves
+anything already there. Implementations of those tools live under
+`src/lerobot/tools/`; one file per tool, registered via
+`TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide.
 
 ## How to run it locally or on SLURM
 
diff --git a/src/lerobot/annotations/steerable_pipeline/executor.py b/src/lerobot/annotations/steerable_pipeline/executor.py
index b24d698d6..79a7f1614 100644
--- a/src/lerobot/annotations/steerable_pipeline/executor.py
+++ b/src/lerobot/annotations/steerable_pipeline/executor.py
@@ -128,8 +128,56 @@ class Executor:
         print(f"[annotate] writing parquet shards into {root}/data/...", flush=True)
         written = self.writer.write_all(records, staging_dir, root)
         print(f"[annotate] wrote {len(written)} shard(s); pipeline complete", flush=True)
+
+        # Persist the tool catalog to meta/info.json so chat-template
+        # consumers (PR 3 SmolVLA2 / Pi0.5 / dataset visualizer) can read
+        # it via ``LeRobotDatasetMetadata.tools`` (PR 1). Idempotent and
+        # additive: anything the user pre-populated is preserved; we only
+        # ensure the canonical ``say`` schema is present.
+        self._ensure_tools_in_info(root)
+
         return PipelineRunSummary(phases=phases, written_paths=written, validation_report=report)
 
+    def _ensure_tools_in_info(self, root: Path) -> None:
+        """Ensure ``meta/info.json["tools"]`` lists the canonical ``say`` tool.
+
+        Best-effort: a missing, unreadable or non-object ``info.json`` is
+        skipped (the latter two with a log line). User-declared tools are
+        kept as-is, malformed entries included; ``SAY_TOOL_SCHEMA`` is
+        appended only when no entry carries its ``function.name``, and
+        the file is rewritten only in that case.
+        """
+        import json  # noqa: PLC0415
+
+        from lerobot.datasets.language import SAY_TOOL_SCHEMA  # noqa: PLC0415
+
+        def _name(tool: object) -> object:
+            # Tolerate malformed user entries instead of crashing the run.
+            fn = tool.get("function") if isinstance(tool, dict) else None
+            return fn.get("name") if isinstance(fn, dict) else None
+
+        info_path = root / "meta" / "info.json"
+        if not info_path.exists():
+            return
+        try:
+            info = json.loads(info_path.read_text())
+        except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
+            print(f"[annotate] could not read {info_path}: {exc}", flush=True)
+            return
+        if not isinstance(info, dict):
+            print(f"[annotate] {info_path} is not a JSON object; tools not written", flush=True)
+            return
+
+        existing = info.get("tools")
+        tools = list(existing) if isinstance(existing, list) else []
+        if any(_name(t) == SAY_TOOL_SCHEMA["function"]["name"] for t in tools):
+            return
+        # Append so pre-declared tools keep their position in the catalog.
+        tools.append(SAY_TOOL_SCHEMA)
+        info["tools"] = tools
+        info_path.write_text(json.dumps(info, indent=2))
+        print(f"[annotate] meta/info.json: tools={[_name(t) for t in tools]}", flush=True)
+
     def _run_module_phase(
         self,
         name: str,
diff --git a/src/lerobot/annotations/steerable_pipeline/writer.py b/src/lerobot/annotations/steerable_pipeline/writer.py
index e595161c6..85c5aff3f 100644
--- a/src/lerobot/annotations/steerable_pipeline/writer.py
+++ b/src/lerobot/annotations/steerable_pipeline/writer.py
@@ -69,36 +69,11 @@ from .staging import EpisodeStaging
 logger = logging.getLogger(__name__)
 
 
-SAY_TOOL_SCHEMA: dict[str, Any] = {
-    "type": "function",
-    "function": {
-        "name": "say",
-        "description": "Speak a short utterance to the user via the TTS executor.",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "text": {
-                    "type": "string",
-                    "description": "The verbatim text to speak.",
-                }
-            },
-            "required": ["text"],
-        },
-    },
-}
-"""Fixed JSON schema for the only tool the canonical recipe knows about.
-
-Kept here as a code constant rather than written as a parquet column so
-the v3.1 schema (PR 1) doesn't need to grow a redundant broadcast field
-that holds the same value on every row of every dataset. Downstream
-chat-template consumers (Pi0.5 processor, lerobot-dataset-visualizer)
-import this directly. If multi-tool-set support ever becomes real, the
-right place is ``meta/info.json["tools"]`` — adding it later is
-non-breaking; ripping out a parquet column already shipped is not.
-"""
-
-DEFAULT_TOOLS: list[dict[str, Any]] = [SAY_TOOL_SCHEMA]
-"""Convenience list for ``apply_chat_template(messages, tools=...)``."""
+# Tool schema constants moved to lerobot.datasets.language in PR 1 — single
+# source of truth. Re-exported here so existing imports
+# (``from lerobot.annotations.steerable_pipeline.writer import SAY_TOOL_SCHEMA``)
+# keep working.
+from lerobot.datasets.language import DEFAULT_TOOLS, SAY_TOOL_SCHEMA  # noqa: F401, E402
 
 
 def _row_persistent_sort_key(row: dict[str, Any]) -> tuple: