mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
feat(annotate): write tool catalog to meta/info.json after annotation
After every ``lerobot-annotate`` run, the executor ensures ``meta/info.json["tools"]`` contains at minimum the canonical ``say`` schema, while preserving any tools the user pre-declared on the dataset. Chat-template consumers (PR 3 SmolVLA2 / Pi0.5 / dataset visualizer) read the catalog through ``LeRobotDatasetMetadata.tools`` and pass it to ``apply_chat_template(messages, tools=meta.tools, ...)``.

- ``executor.py``: new ``_ensure_tools_in_info`` helper called after the parquet rewrite. Idempotent and additive — merges by ``function.name``, only writes back if the list changed.
- ``writer.py``: drops the duplicated ``SAY_TOOL_SCHEMA`` / ``DEFAULT_TOOLS`` constants in favour of importing from ``lerobot.datasets.language`` (PR 1's single source of truth). Re-exported so existing imports keep working.
- ``annotation_pipeline.mdx``: replace the "code constant only" note with a pointer to the new Tools doc and a description of the meta/info.json behaviour, including how to pre-declare custom tools before annotation runs.

This is the storage half of the tools work; PR 3 ships the runnable implementations under ``src/lerobot/tools/`` (one file per tool, first up: ``say.py`` wired to Kyutai's pocket-tts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,12 +21,19 @@ rewrites the data shards in place:
|
|||||||
| `vqa` (user / assistant pair) | `language_events` | Module 3 |
|
| `vqa` (user / assistant pair) | `language_events` | Module 3 |
|
||||||
|
|
||||||
The writer drops the legacy `subtask_index` column. It does **not** add a
|
The writer drops the legacy `subtask_index` column. It does **not** add a
|
||||||
`tools` column to the parquet — the `say` tool's JSON schema is fixed and
|
`tools` column to the parquet — the tool catalog lives at
|
||||||
lives as a code constant (`SAY_TOOL_SCHEMA` / `DEFAULT_TOOLS` in
|
`meta/info.json["tools"]` instead (see [Tools](./tools)). After every
|
||||||
`lerobot.annotations.steerable_pipeline.writer`), so the parquet stays
|
annotation run the pipeline ensures the canonical `say` schema is
|
||||||
small and PR 2 doesn't extend PR 1's schema. Chat-template consumers
|
present in that list, preserving any tools the user pre-declared.
|
||||||
import the constant directly (e.g.
|
Chat-template consumers read the catalog through
|
||||||
`apply_chat_template(messages, tools=DEFAULT_TOOLS)`).
|
`LeRobotDatasetMetadata.tools` and pass it to
|
||||||
|
`apply_chat_template(messages, tools=meta.tools, ...)`.
|
||||||
|
|
||||||
|
If you want to declare additional tools for a dataset before annotation
|
||||||
|
runs, edit `meta/info.json["tools"]` directly — the pipeline preserves
|
||||||
|
anything already there. Implementations of those tools live under
|
||||||
|
`src/lerobot/tools/`; one file per tool, registered via
|
||||||
|
`TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide.
|
||||||
|
|
||||||
## How to run it locally or on SLURM
|
## How to run it locally or on SLURM
|
||||||
|
|
||||||
|
|||||||
@@ -128,8 +128,56 @@ class Executor:
|
|||||||
print(f"[annotate] writing parquet shards into {root}/data/...", flush=True)
|
print(f"[annotate] writing parquet shards into {root}/data/...", flush=True)
|
||||||
written = self.writer.write_all(records, staging_dir, root)
|
written = self.writer.write_all(records, staging_dir, root)
|
||||||
print(f"[annotate] wrote {len(written)} shard(s); pipeline complete", flush=True)
|
print(f"[annotate] wrote {len(written)} shard(s); pipeline complete", flush=True)
|
||||||
|
|
||||||
|
# Persist the tool catalog to meta/info.json so chat-template
|
||||||
|
# consumers (PR 3 SmolVLA2 / Pi0.5 / dataset visualizer) can read
|
||||||
|
# it via ``LeRobotDatasetMetadata.tools`` (PR 1). Idempotent and
|
||||||
|
# additive: anything the user pre-populated is preserved; we only
|
||||||
|
# ensure the canonical ``say`` schema is present.
|
||||||
|
self._ensure_tools_in_info(root)
|
||||||
|
|
||||||
return PipelineRunSummary(phases=phases, written_paths=written, validation_report=report)
|
return PipelineRunSummary(phases=phases, written_paths=written, validation_report=report)
|
||||||
|
|
||||||
|
def _ensure_tools_in_info(self, root: Path) -> None:
    """Ensure ``meta/info.json["tools"]`` contains the canonical ``say`` tool.

    Reads ``<root>/meta/info.json``, keeps every tool entry already listed
    under ``"tools"`` and appends ``SAY_TOOL_SCHEMA`` when no existing entry
    has the same ``function.name``. The file is rewritten only when the
    schema had to be appended, so repeated calls leave it untouched.

    The step is best-effort on the read side: a missing file, an unreadable
    file, invalid JSON, or a top-level JSON value that is not an object
    makes the method return without modifying anything (a message is
    printed for every case except the missing file). A ``"tools"`` value
    that is not a list is treated as an empty catalog and replaced.

    Args:
        root: Dataset root directory; ``meta/info.json`` is resolved
            relative to it.
    """
    import json  # noqa: PLC0415

    from lerobot.datasets.language import SAY_TOOL_SCHEMA  # noqa: PLC0415

    def _tool_name(tool):
        """Return ``tool["function"]["name"]``, or ``None`` if malformed."""
        if not isinstance(tool, dict):
            return None
        function = tool.get("function")
        if not isinstance(function, dict):
            return None
        return function.get("name")

    info_path = root / "meta" / "info.json"
    if not info_path.exists():
        return
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        info = json.loads(info_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[annotate] could not read {info_path}: {exc}", flush=True)
        return
    if not isinstance(info, dict):
        # A list/scalar root has no "tools" key to merge into; leave it alone.
        print(
            f"[annotate] could not read {info_path}: top-level JSON value is not an object",
            flush=True,
        )
        return

    existing = info.get("tools")
    if not isinstance(existing, list):
        existing = []

    say_name = SAY_TOOL_SCHEMA["function"]["name"]
    # Equality scan instead of a set so that malformed (e.g. unhashable)
    # names in user-declared entries cannot raise.
    if any(_tool_name(tool) == say_name for tool in existing):
        return

    merged = [*existing, SAY_TOOL_SCHEMA]
    info["tools"] = merged
    info_path.write_text(json.dumps(info, indent=2), encoding="utf-8")
    # Report names through the same tolerant accessor used for the merge so
    # a malformed pre-declared entry cannot crash after the file was written.
    print(
        f"[annotate] meta/info.json: tools={[_tool_name(tool) for tool in merged]}",
        flush=True,
    )
|
||||||
|
|
||||||
def _run_module_phase(
|
def _run_module_phase(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
|
|||||||
@@ -69,36 +69,11 @@ from .staging import EpisodeStaging
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
SAY_TOOL_SCHEMA: dict[str, Any] = {
|
# Tool schema constants moved to lerobot.datasets.language in PR 1 — single
|
||||||
"type": "function",
|
# source of truth. Re-exported here so existing imports
|
||||||
"function": {
|
# (``from lerobot.annotations.steerable_pipeline.writer import SAY_TOOL_SCHEMA``)
|
||||||
"name": "say",
|
# keep working.
|
||||||
"description": "Speak a short utterance to the user via the TTS executor.",
|
from lerobot.datasets.language import DEFAULT_TOOLS, SAY_TOOL_SCHEMA # noqa: F401, E402
|
||||||
"parameters": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"text": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The verbatim text to speak.",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["text"],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
"""Fixed JSON schema for the only tool the canonical recipe knows about.
|
|
||||||
|
|
||||||
Kept here as a code constant rather than written as a parquet column so
|
|
||||||
the v3.1 schema (PR 1) doesn't need to grow a redundant broadcast field
|
|
||||||
that holds the same value on every row of every dataset. Downstream
|
|
||||||
chat-template consumers (Pi0.5 processor, lerobot-dataset-visualizer)
|
|
||||||
import this directly. If multi-tool-set support ever becomes real, the
|
|
||||||
right place is ``meta/info.json["tools"]`` — adding it later is
|
|
||||||
non-breaking; ripping out a parquet column already shipped is not.
|
|
||||||
"""
|
|
||||||
|
|
||||||
DEFAULT_TOOLS: list[dict[str, Any]] = [SAY_TOOL_SCHEMA]
|
|
||||||
"""Convenience list for ``apply_chat_template(messages, tools=...)``."""
|
|
||||||
|
|
||||||
|
|
||||||
def _row_persistent_sort_key(row: dict[str, Any]) -> tuple:
|
def _row_persistent_sort_key(row: dict[str, Any]) -> tuple:
|
||||||
|
|||||||
Reference in New Issue
Block a user