From c450298147c1b81bd80950a097cc1c182f087a25 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Wed, 6 May 2026 12:10:41 +0200
Subject: [PATCH] Apply ruff and prettier formatting after merge

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 docs/source/language_and_recipes.mdx    |  8 +++++-
 docs/source/tools.mdx                   | 35 +++++++++++++++++---------
 tests/datasets/test_language_render.py  | 28 ++++++++++-----------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/docs/source/language_and_recipes.mdx b/docs/source/language_and_recipes.mdx
index cb58c516a..82401c8f6 100644
--- a/docs/source/language_and_recipes.mdx
+++ b/docs/source/language_and_recipes.mdx
@@ -100,7 +100,13 @@ ask_vqa_top:
       content:
         - { type: image, feature: observation.images.top }
         - { type: text, text: "${vqa_query}" }
-    - { role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa }
+    - {
+        role: assistant,
+        content: "${vqa}",
+        stream: high_level,
+        target: true,
+        if_present: vqa,
+      }
 ```
 
 Add one such sub-recipe per camera the dataset records.
diff --git a/docs/source/tools.mdx b/docs/source/tools.mdx
index 7dc2500f1..3309be8cd 100644
--- a/docs/source/tools.mdx
+++ b/docs/source/tools.mdx
@@ -29,7 +29,10 @@ Two layers.
       "parameters": {
         "type": "object",
         "properties": {
-          "text": { "type": "string", "description": "The verbatim text to speak." }
+          "text": {
+            "type": "string",
+            "description": "The verbatim text to speak."
+          }
         },
         "required": ["text"]
       }
@@ -67,9 +70,9 @@ prompt_str = tokenizer.apply_chat_template(
 `src/lerobot/tools/`, one file per tool. The canonical `say`
 implementation wraps Kyutai's pocket-tts model.
 
-## Per-row tool *invocations*
+## Per-row tool _invocations_
 
-The catalog above describes *what can be called*. The actual *call* — the
+The catalog above describes _what can be called_. The actual _call_ — the
 function name plus the argument values — is stored per-row, on the
 assistant atoms in `language_events`:
 
@@ -94,13 +97,18 @@ user_interjection_response:
   bindings:
     speech: "emitted_at(t, role=assistant, tool_name=say)"
   messages:
-  - { role: user, content: "${task}", stream: high_level }
-  - { role: assistant, content: "${current_plan}", stream: high_level,
-      target: true, tool_calls_from: speech }
+    - { role: user, content: "${task}", stream: high_level }
+    - {
+        role: assistant,
+        content: "${current_plan}",
+        stream: high_level,
+        target: true,
+        tool_calls_from: speech,
+      }
 ```
 
 The model's training target is one assistant turn that carries both the
-plan text *and* the `say` tool call. At inference, the runtime parses
+plan text _and_ the `say` tool call. At inference, the runtime parses
 the generated text back into structured `tool_calls` and dispatches to
 the matching implementation.
 
@@ -113,7 +121,7 @@ loop.
 ### Step 1 — declare the schema
 
 Add an entry under `meta/info.json["tools"]`. Either edit the file
-directly on disk *before* running the annotation pipeline (it'll be
+directly on disk _before_ running the annotation pipeline (it'll be
 preserved) or hand it to `lerobot-annotate` via a config flag.
 
 ```json
@@ -128,7 +136,10 @@ preserved) or hand it to `lerobot-annotate` via a config flag.
       "parameters": {
         "type": "object",
         "properties": {
-          "label": { "type": "string", "description": "Short label for the saved image." }
+          "label": {
+            "type": "string",
+            "description": "Short label for the saved image."
+          }
         },
         "required": ["label"]
       }
@@ -183,7 +194,7 @@ That's it.
 At runtime `get_tools(meta)` looks up each schema in `meta.tools`,
 instantiates the matching registered class, and returns a name →
 instance dict the dispatcher can route into.
-If you want to use a tool *without* writing an implementation (e.g. for
+If you want to use a tool _without_ writing an implementation (e.g. for
 training-time chat-template formatting only), step 1 alone is enough —
-the model still learns to *generate* the call. Steps 2 and 3 are only
-needed to actually *execute* it at inference.
+the model still learns to _generate_ the call. Steps 2 and 3 are only
+needed to actually _execute_ it at inference.
diff --git a/tests/datasets/test_language_render.py b/tests/datasets/test_language_render.py
index ca27089fc..d8befecac 100644
--- a/tests/datasets/test_language_render.py
+++ b/tests/datasets/test_language_render.py
@@ -205,12 +205,8 @@ def test_per_camera_blend_renders_both_views():
             "top": TrainingRecipe(
                 weight=1.0,
                 bindings={
-                    "vqa_query": (
-                        "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
-                    ),
-                    "vqa": (
-                        "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
-                    ),
+                    "vqa_query": ("emitted_at(t, style=vqa, role=user, camera=observation.images.top)"),
+                    "vqa": ("emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"),
                 },
                 messages=[
                     MessageTurn(
@@ -234,12 +230,8 @@
             "wrist": TrainingRecipe(
                 weight=1.0,
                 bindings={
-                    "vqa_query": (
-                        "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
-                    ),
-                    "vqa": (
-                        "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
-                    ),
+                    "vqa_query": ("emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"),
+                    "vqa": ("emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"),
                 },
                 messages=[
                     MessageTurn(
@@ -317,11 +309,19 @@ def test_resolve_task_picks_rephrasing_deterministically_per_sample():
     assert seen == {r["content"] for r in rephrasings}
     # Same sample_idx → same pick (determinism).
     a = render_sample(
-        recipe=recipe, persistent=rephrasings, events=[], t=0.0, sample_idx=42,
+        recipe=recipe,
+        persistent=rephrasings,
+        events=[],
+        t=0.0,
+        sample_idx=42,
         dataset_ctx={"task": "canonical"},
     )
     b = render_sample(
-        recipe=recipe, persistent=rephrasings, events=[], t=0.0, sample_idx=42,
+        recipe=recipe,
+        persistent=rephrasings,
+        events=[],
+        t=0.0,
+        sample_idx=42,
         dataset_ctx={"task": "canonical"},
     )
     assert a["messages"][0]["content"] == b["messages"][0]["content"]
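
The `tools.mdx` text touched above describes a name-keyed registry: `get_tools(meta)` looks up each schema declared in `meta.tools`, instantiates the matching registered class, and returns a name → instance dict. A minimal, self-contained sketch of that pattern is below; the `Tool` base class, `register_tool` decorator, and `MarkWaypoint` class are illustrative assumptions, not lerobot's actual API, and the flat schema in the usage example is simplified from the JSON shown in the docs.

```python
# Illustrative sketch only: Tool, register_tool, and this get_tools stand-in
# mirror the behaviour described in tools.mdx; the names are assumptions.
from abc import ABC, abstractmethod
from types import SimpleNamespace


class Tool(ABC):
    """Base class for a callable tool backing one schema from meta/info.json."""

    def __init__(self, schema: dict):
        self.schema = schema

    @abstractmethod
    def __call__(self, **arguments):
        """Run the tool with arguments parsed from the model's tool call."""


_TOOL_REGISTRY: dict[str, type[Tool]] = {}


def register_tool(name: str):
    """Class decorator mapping a declared tool name to its implementation."""

    def wrap(cls: type[Tool]) -> type[Tool]:
        _TOOL_REGISTRY[name] = cls
        return cls

    return wrap


@register_tool("mark_waypoint")
class MarkWaypoint(Tool):
    def __call__(self, label: str):
        # Hypothetical behaviour; a real implementation would also save a frame.
        print(f"waypoint saved: {label}")


def get_tools(meta) -> dict[str, Tool]:
    """Return a name -> instance dict for every schema with an implementation."""
    tools = {}
    for schema in meta.tools:
        cls = _TOOL_REGISTRY.get(schema["name"])
        if cls is not None:  # schemas without an implementation stay training-only
            tools[schema["name"]] = cls(schema)
    return tools


if __name__ == "__main__":
    # Simplified stand-in for dataset meta, with a flat illustrative schema.
    meta = SimpleNamespace(tools=[{"name": "mark_waypoint", "parameters": {}}])
    get_tools(meta)["mark_waypoint"](label="above the red bin")
```

Skipping schemas that have no registered class is what keeps the "step 1 alone is enough" path from the docs working: the model can still be trained to generate such calls even though nothing executes them at inference.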