mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-25 05:29:55 +00:00
Apply ruff and prettier formatting after merge
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -100,7 +100,13 @@ ask_vqa_top:
|
|||||||
content:
|
content:
|
||||||
- { type: image, feature: observation.images.top }
|
- { type: image, feature: observation.images.top }
|
||||||
- { type: text, text: "${vqa_query}" }
|
- { type: text, text: "${vqa_query}" }
|
||||||
- { role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa }
|
- {
|
||||||
|
role: assistant,
|
||||||
|
content: "${vqa}",
|
||||||
|
stream: high_level,
|
||||||
|
target: true,
|
||||||
|
if_present: vqa,
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Add one such sub-recipe per camera the dataset records.
|
Add one such sub-recipe per camera the dataset records.
|
||||||
|
|||||||
+23
-12
@@ -29,7 +29,10 @@ Two layers.
|
|||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"text": { "type": "string", "description": "The verbatim text to speak." }
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The verbatim text to speak."
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"required": ["text"]
|
"required": ["text"]
|
||||||
}
|
}
|
||||||
@@ -67,9 +70,9 @@ prompt_str = tokenizer.apply_chat_template(
|
|||||||
`src/lerobot/tools/`, one file per tool. The canonical `say`
|
`src/lerobot/tools/`, one file per tool. The canonical `say`
|
||||||
implementation wraps Kyutai's pocket-tts model.
|
implementation wraps Kyutai's pocket-tts model.
|
||||||
|
|
||||||
## Per-row tool *invocations*
|
## Per-row tool _invocations_
|
||||||
|
|
||||||
The catalog above describes *what can be called*. The actual *call* — the
|
The catalog above describes _what can be called_. The actual _call_ — the
|
||||||
function name plus the argument values — is stored per-row, on the
|
function name plus the argument values — is stored per-row, on the
|
||||||
assistant atoms in `language_events`:
|
assistant atoms in `language_events`:
|
||||||
|
|
||||||
@@ -94,13 +97,18 @@ user_interjection_response:
|
|||||||
bindings:
|
bindings:
|
||||||
speech: "emitted_at(t, role=assistant, tool_name=say)"
|
speech: "emitted_at(t, role=assistant, tool_name=say)"
|
||||||
messages:
|
messages:
|
||||||
- { role: user, content: "${task}", stream: high_level }
|
- { role: user, content: "${task}", stream: high_level }
|
||||||
- { role: assistant, content: "${current_plan}", stream: high_level,
|
- {
|
||||||
target: true, tool_calls_from: speech }
|
role: assistant,
|
||||||
|
content: "${current_plan}",
|
||||||
|
stream: high_level,
|
||||||
|
target: true,
|
||||||
|
tool_calls_from: speech,
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The model's training target is one assistant turn that carries both the
|
The model's training target is one assistant turn that carries both the
|
||||||
plan text *and* the `say` tool call. At inference, the runtime parses
|
plan text _and_ the `say` tool call. At inference, the runtime parses
|
||||||
the generated text back into structured `tool_calls` and dispatches to
|
the generated text back into structured `tool_calls` and dispatches to
|
||||||
the matching implementation.
|
the matching implementation.
|
||||||
|
|
||||||
@@ -113,7 +121,7 @@ loop.
|
|||||||
### Step 1 — declare the schema
|
### Step 1 — declare the schema
|
||||||
|
|
||||||
Add an entry under `meta/info.json["tools"]`. Either edit the file
|
Add an entry under `meta/info.json["tools"]`. Either edit the file
|
||||||
directly on disk *before* running the annotation pipeline (it'll be
|
directly on disk _before_ running the annotation pipeline (it'll be
|
||||||
preserved) or hand it to `lerobot-annotate` via a config flag.
|
preserved) or hand it to `lerobot-annotate` via a config flag.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
@@ -128,7 +136,10 @@ preserved) or hand it to `lerobot-annotate` via a config flag.
|
|||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"label": { "type": "string", "description": "Short label for the saved image." }
|
"label": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Short label for the saved image."
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"required": ["label"]
|
"required": ["label"]
|
||||||
}
|
}
|
||||||
@@ -183,7 +194,7 @@ That's it. At runtime `get_tools(meta)` looks up each schema in
|
|||||||
`meta.tools`, instantiates the matching registered class, and returns
|
`meta.tools`, instantiates the matching registered class, and returns
|
||||||
a name → instance dict the dispatcher can route into.
|
a name → instance dict the dispatcher can route into.
|
||||||
|
|
||||||
If you want to use a tool *without* writing an implementation (e.g. for
|
If you want to use a tool _without_ writing an implementation (e.g. for
|
||||||
training-time chat-template formatting only), step 1 alone is enough —
|
training-time chat-template formatting only), step 1 alone is enough —
|
||||||
the model still learns to *generate* the call. Steps 2 and 3 are only
|
the model still learns to _generate_ the call. Steps 2 and 3 are only
|
||||||
needed to actually *execute* it at inference.
|
needed to actually _execute_ it at inference.
|
||||||
|
|||||||
@@ -205,12 +205,8 @@ def test_per_camera_blend_renders_both_views():
|
|||||||
"top": TrainingRecipe(
|
"top": TrainingRecipe(
|
||||||
weight=1.0,
|
weight=1.0,
|
||||||
bindings={
|
bindings={
|
||||||
"vqa_query": (
|
"vqa_query": ("emitted_at(t, style=vqa, role=user, camera=observation.images.top)"),
|
||||||
"emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
|
"vqa": ("emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"),
|
||||||
),
|
|
||||||
"vqa": (
|
|
||||||
"emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
messages=[
|
messages=[
|
||||||
MessageTurn(
|
MessageTurn(
|
||||||
@@ -234,12 +230,8 @@ def test_per_camera_blend_renders_both_views():
|
|||||||
"wrist": TrainingRecipe(
|
"wrist": TrainingRecipe(
|
||||||
weight=1.0,
|
weight=1.0,
|
||||||
bindings={
|
bindings={
|
||||||
"vqa_query": (
|
"vqa_query": ("emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"),
|
||||||
"emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
"vqa": ("emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"),
|
||||||
),
|
|
||||||
"vqa": (
|
|
||||||
"emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
messages=[
|
messages=[
|
||||||
MessageTurn(
|
MessageTurn(
|
||||||
@@ -317,11 +309,19 @@ def test_resolve_task_picks_rephrasing_deterministically_per_sample():
|
|||||||
assert seen == {r["content"] for r in rephrasings}
|
assert seen == {r["content"] for r in rephrasings}
|
||||||
# Same sample_idx → same pick (determinism).
|
# Same sample_idx → same pick (determinism).
|
||||||
a = render_sample(
|
a = render_sample(
|
||||||
recipe=recipe, persistent=rephrasings, events=[], t=0.0, sample_idx=42,
|
recipe=recipe,
|
||||||
|
persistent=rephrasings,
|
||||||
|
events=[],
|
||||||
|
t=0.0,
|
||||||
|
sample_idx=42,
|
||||||
dataset_ctx={"task": "canonical"},
|
dataset_ctx={"task": "canonical"},
|
||||||
)
|
)
|
||||||
b = render_sample(
|
b = render_sample(
|
||||||
recipe=recipe, persistent=rephrasings, events=[], t=0.0, sample_idx=42,
|
recipe=recipe,
|
||||||
|
persistent=rephrasings,
|
||||||
|
events=[],
|
||||||
|
t=0.0,
|
||||||
|
sample_idx=42,
|
||||||
dataset_ctx={"task": "canonical"},
|
dataset_ctx={"task": "canonical"},
|
||||||
)
|
)
|
||||||
assert a["messages"][0]["content"] == b["messages"][0]["content"]
|
assert a["messages"][0]["content"] == b["messages"][0]["content"]
|
||||||
|
|||||||
Reference in New Issue
Block a user