From 7a68bf13d95b141ca9301a35510310c6a1ebc1fe Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 14:21:41 +0200
Subject: [PATCH] =?UTF-8?q?feat(recipes):=20add=20hirobot=5Fmemory=20?=
 =?UTF-8?q?=E2=80=94=20hirobot=20+=20memory=20+=20spoken=20tool-call=20rep?=
 =?UTF-8?q?lies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New recipe alongside hirobot.yaml (kept as the lean baseline). It is a
superset that adds two text-supervised sub-recipes:

- memory_update: compress progress into a memory note.
- user_interjection_response: reply to a user interjection with a `say`
  tool call only (no plan/subtask text). The SmolVLA2 chat tokenizer
  flattens the call to a `<say>...</say>` marker the runtime parses back.

Plan is intentionally omitted; memory is the only persistent high-level
state. Weights: low_level 0.40, subtask 0.25, memory 0.10, interjection
0.10, vqa 0.075 x2.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../configs/recipes/hirobot_memory.yaml       | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 src/lerobot/configs/recipes/hirobot_memory.yaml
diff --git a/src/lerobot/configs/recipes/hirobot_memory.yaml b/src/lerobot/configs/recipes/hirobot_memory.yaml
new file mode 100644
index 000000000..2470c65cf
--- /dev/null
+++ b/src/lerobot/configs/recipes/hirobot_memory.yaml
@@ -0,0 +1,103 @@
+# Hi-Robot blend + memory + tool-call (spoken) responses.
+#
+# Superset of hirobot.yaml: keeps the core subtask + action + VQA
+# training and adds two text-supervised tasks, memory_update and
+# user_interjection_response. All sub-recipes in the blend:
+#   high_level_subtask         — predict the subtask from the task.
+#   low_level_execution        — flow loss with [images, subtask, state].
+#   memory_update              — compress progress into a memory note.
+#   user_interjection_response — reply to a user interjection with a
+#                                spoken `say` tool call (no plan, no
+#                                subtask text — just the spoken reply).
+#   ask_vqa_{top,wrist}        — camera-grounded VQA.
+#
+# Plan is intentionally left out — memory is the only persistent
+# high-level state here, keeping the prompt short.
+#
+# The full blend expects `memory`, `interjection` and `say`-tool
+# annotations (the annotation pipeline's memory + interjection modules)
+# in addition to `subtask` and `vqa`. These are not hard requirements:
+# a sub-recipe whose `if_present` binding is missing doesn't render for
+# that sample, so a dataset without interjections still trains the rest.
+#
+# SmolVLA2 note: the `say` tool call on the interjection-response turn
+# is flattened to a `<say>...</say>` text marker by the chat tokenizer
+# (`_flatten_say_tool_calls`) before `apply_chat_template`, so the LM
+# head learns to emit exactly the marker the runtime parses back
+# (`_split_plan_and_say`).
+
+blend:
+
+  high_level_subtask:  # predict the subtask from the task
+    weight: 0.25
+    messages:  # the assistant turn is the only one marked `target: true`
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
+
+  low_level_execution:  # flow loss with [images, subtask, state]
+    weight: 0.40  # largest weight in the blend
+    messages:
+      # The action expert is conditioned on the SUBTASK — at inference
+      # `HighLevelSubtaskFwd` generates it via the LM head and feeds it
+      # here. `stream: low_level` flips `predict_actions=True` so the
+      # flow loss fires. No turn sets `target: true`: there is no text-CE
+      # target, since `high_level_subtask` owns subtask prediction.
+      - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
+
+  memory_update:  # compress progress into a memory note
+    weight: 0.10
+    bindings:
+      prior_memory: "nth_prev(style=memory, offset=1)"  # presumably the memory note before the current one — confirm resolver semantics
+      current_memory: "emitted_at(t, style=memory)"  # rendered as the final, supervised turn
+      completed_subtask: "nth_prev(style=subtask, offset=1)"  # presumably the subtask that just finished — confirm
+    messages:  # the two middle turns are gated by `if_present`; only the last turn is a target
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: assistant, content: "Previous memory: ${prior_memory}", stream: high_level, if_present: prior_memory}
+      - {role: user, content: "Completed subtask: ${completed_subtask}", stream: high_level, if_present: completed_subtask}
+      - {role: assistant, content: "${current_memory}", stream: high_level, target: true, if_present: current_memory}
+
+  user_interjection_response:  # reply to a user interjection with a spoken `say` tool call
+    weight: 0.10
+    bindings:
+      interjection: "emitted_at(t, style=interjection)"  # rendered as the second user turn
+      speech: "emitted_at(t, role=assistant, tool_name=say)"  # referenced by `tool_calls_from` on the target turn
+    messages:
+      - {role: user, content: "${task}", stream: high_level}
+      - {role: user, content: "${interjection}", stream: high_level, if_present: interjection}
+      # Spoken reply only: the assistant turn has no `content` key, just
+      # a `say` tool call (`tool_calls_from: speech`). The chat
+      # tokenizer flattens it to a `<say>...</say>` marker, so the
+      # supervised target trains the model to respond to an
+      # interjection with a spoken acknowledgement.
+      - {role: assistant, stream: high_level, target: true, if_present: speech, tool_calls_from: speech}
+
+  # VQA is view-dependent — each camera gets its own sub-recipe so the
+  # resolver disambiguates via `camera=...`. NOTE: `ask_vqa_top` reads
+  # the `front` camera key (as in hirobot.yaml); adjust to your dataset.
+  ask_vqa_top:  # camera-grounded VQA on `observation.images.front`
+    weight: 0.075
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
+    messages:
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:  # multimodal turn: the camera image followed by the question text
+          - {type: image, feature: observation.images.front}
+          - {type: text, text: "${vqa_query}"}
+      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
+
+  ask_vqa_wrist:  # camera-grounded VQA on `observation.images.wrist`
+    weight: 0.075
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
+    messages:
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:  # multimodal turn: the camera image followed by the question text
+          - {type: image, feature: observation.images.wrist}
+          - {type: text, text: "${vqa_query}"}
+      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}