From 7a68bf13d95b141ca9301a35510310c6a1ebc1fe Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 18 May 2026 14:21:41 +0200
Subject: [PATCH] =?UTF-8?q?feat(recipes):=20add=20hirobot=5Fmemory=20?=
 =?UTF-8?q?=E2=80=94=20hirobot=20+=20memory=20+=20spoken=20tool-call=20rep?=
 =?UTF-8?q?lies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New recipe alongside hirobot.yaml (kept as the lean baseline). Superset
that adds two text-supervised sub-recipes:

- memory_update: compress progress into a memory note.
- user_interjection_response: reply to a user interjection with a `say`
  tool call only (no plan/subtask text). The SmolVLA2 chat tokenizer
  flattens the call to a `<say>...</say>` marker the runtime parses back.

Plan is intentionally omitted; memory is the only persistent high-level
state. Weights: low_level 0.40, subtask 0.25, memory 0.10, interjection
0.10, vqa 0.075 x2.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../configs/recipes/hirobot_memory.yaml       | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 src/lerobot/configs/recipes/hirobot_memory.yaml

diff --git a/src/lerobot/configs/recipes/hirobot_memory.yaml b/src/lerobot/configs/recipes/hirobot_memory.yaml
new file mode 100644
index 000000000..2470c65cf
--- /dev/null
+++ b/src/lerobot/configs/recipes/hirobot_memory.yaml
@@ -0,0 +1,103 @@
+# Hi-Robot blend + memory + tool-call (spoken) responses.
+#
+# Superset of hirobot.yaml. Keeps the core subtask + action + VQA
+# training, and adds two text-supervised tasks:
+#
+#   high_level_subtask         — predict the subtask from the task.
+#   low_level_execution        — flow loss with [images, subtask, state].
+#   memory_update              — compress progress into a memory note.
+#   user_interjection_response — reply to a user interjection with a
+#                                spoken `say` tool call (no plan, no
+#                                subtask text — just the spoken reply).
+#   ask_vqa_{top,wrist}        — camera-grounded VQA.
+#
+# Plan is intentionally left out — memory is the only persistent
+# high-level state here, keeping the prompt short.
+#
+# Requires the dataset to carry `memory`, `interjection` and `say`-tool
+# annotations (the annotation pipeline's memory + interjection modules)
+# in addition to `subtask` and `vqa`. Sub-recipes whose `if_present`
+# bindings are missing simply don't render for that sample, so a
+# dataset without interjections still trains the rest of the blend.
+#
+# SmolVLA2 note: the `say` tool call on the interjection-response turn
+# is flattened to a `<say>...</say>` text marker by the chat tokenizer
+# (`_flatten_say_tool_calls`) before `apply_chat_template`, so the LM
+# head learns to emit exactly the marker the runtime parses back
+# (`_split_plan_and_say`).
+
+blend:
+
+  high_level_subtask:
+    weight: 0.25
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
+
+  low_level_execution:
+    weight: 0.40
+    messages:
+      # The action expert is conditioned on the SUBTASK — at inference
+      # `HighLevelSubtaskFwd` generates it via the LM head and feeds it
+      # here. `stream: low_level` flips `predict_actions=True` so the
+      # flow loss fires; no text-CE target (subtask prediction is owned
+      # by `high_level_subtask`).
+      - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
+
+  memory_update:
+    weight: 0.10
+    bindings:
+      prior_memory: "nth_prev(style=memory, offset=1)"
+      current_memory: "emitted_at(t, style=memory)"
+      completed_subtask: "nth_prev(style=subtask, offset=1)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "Previous memory: ${prior_memory}", stream: high_level, if_present: prior_memory}
+      - {role: user, content: "Completed subtask: ${completed_subtask}", stream: high_level, if_present: completed_subtask}
+      - {role: assistant, content: "${current_memory}", stream: high_level, target: true, if_present: current_memory}
+
+  user_interjection_response:
+    weight: 0.10
+    bindings:
+      interjection: "emitted_at(t, style=interjection)"
+      speech: "emitted_at(t, role=assistant, tool_name=say)"
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: user, content: "${interjection}", stream: high_level, if_present: interjection}
+      # Spoken reply only: the assistant turn carries no text content,
+      # just a `say` tool call (`tool_calls_from: speech`). The chat
+      # tokenizer flattens it to a `<say>...</say>` marker, so the
+      # supervised target trains the model to respond to an
+      # interjection with a spoken acknowledgement.
+      - {role: assistant, stream: high_level, target: true, if_present: speech, tool_calls_from: speech}
+
+  # VQA is view-dependent — each camera gets its own sub-recipe so the
+  # resolver disambiguates via `camera=...`. Camera keys match
+  # hirobot.yaml (`front` + `wrist`); adjust to your dataset.
+  ask_vqa_top:
+    weight: 0.075
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
+    messages:
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:
+          - {type: image, feature: observation.images.front}
+          - {type: text, text: "${vqa_query}"}
+      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
+
+  ask_vqa_wrist:
+    weight: 0.075
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
+    messages:
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:
+          - {type: image, feature: observation.images.wrist}
+          - {type: text, text: "${vqa_query}"}
+      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}