diff --git a/src/lerobot/configs/recipes/hirobot_memory.yaml b/src/lerobot/configs/recipes/hirobot_memory.yaml
new file mode 100644
index 000000000..2470c65cf
--- /dev/null
+++ b/src/lerobot/configs/recipes/hirobot_memory.yaml
@@ -0,0 +1,103 @@
+# Hi-Robot blend + memory + tool-call (spoken) responses.
+#
+# Superset of hirobot.yaml: keeps the core subtask + action + VQA training
+# and adds memory_update + user_interjection_response. The full blend:
+#
+# high_level_subtask — predict the subtask from the task.
+# low_level_execution — flow loss with [images, subtask, state].
+# memory_update — compress progress into a memory note.
+# user_interjection_response — reply to a user interjection with a
+# spoken `say` tool call (no plan, no
+# subtask text — just the spoken reply).
+# ask_vqa_{top,wrist} — camera-grounded VQA.
+#
+# Plan is intentionally left out — memory is the only persistent
+# high-level state here, keeping the prompt short.
+#
+# Requires the dataset to carry `memory`, `interjection` and `say`-tool
+# annotations (the annotation pipeline's memory + interjection modules)
+# in addition to `subtask` and `vqa`. Sub-recipes whose `if_present`
+# bindings are missing simply don't render for that sample, so a
+# dataset without interjections still trains the rest of the blend.
+#
+# SmolVLA2 note: the `say` tool call on the interjection-response turn
+# is flattened to a `<say>...</say>` text marker by the chat tokenizer
+# (`_flatten_say_tool_calls`) before `apply_chat_template`, so the LM
+# head learns to emit exactly the marker the runtime parses back
+# (`_split_plan_and_say`).
+
+blend:
+
+ high_level_subtask:
+ weight: 0.25  # presumably the sampling share; the six sub-recipe weights total 1.0
+ messages:  # task prompt in, subtask text out (the `target: true` turn)
+ - {role: user, content: "${task}", stream: high_level}
+ - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
+
+ low_level_execution:
+ weight: 0.40  # largest weight in this blend
+ messages:  # single user turn, no `target: true`; gated on the `subtask` binding
+ # The action expert is conditioned on the SUBTASK — at inference
+ # `HighLevelSubtaskFwd` generates it via the LM head and feeds it
+ # here. `stream: low_level` flips `predict_actions=True` so the
+ # flow loss fires; no text-CE target (subtask prediction is owned
+ # by `high_level_subtask`).
+ - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
+
+ memory_update:
+ weight: 0.10
+ bindings:  # NOTE(review): resolver semantics below are inferred from the call names — confirm
+ prior_memory: "nth_prev(style=memory, offset=1)"  # presumably the memory note before the current one
+ current_memory: "emitted_at(t, style=memory)"  # presumably the memory note emitted at this sample's t
+ completed_subtask: "nth_prev(style=subtask, offset=1)"  # presumably the subtask preceding the current one
+ messages:  # the two middle context turns are `if_present`-gated; only `${current_memory}` is the target
+ - {role: user, content: "${task}", stream: high_level}
+ - {role: assistant, content: "Previous memory: ${prior_memory}", stream: high_level, if_present: prior_memory}
+ - {role: user, content: "Completed subtask: ${completed_subtask}", stream: high_level, if_present: completed_subtask}
+ - {role: assistant, content: "${current_memory}", stream: high_level, target: true, if_present: current_memory}
+
+ user_interjection_response:
+ weight: 0.10
+ bindings:  # NOTE(review): resolver semantics below are inferred from the call names — confirm
+ interjection: "emitted_at(t, style=interjection)"  # presumably the user interjection at this sample's t
+ speech: "emitted_at(t, role=assistant, tool_name=say)"  # presumably the assistant `say` tool call at t
+ messages:  # two consecutive user turns (task, then interjection) precede the target turn
+ - {role: user, content: "${task}", stream: high_level}
+ - {role: user, content: "${interjection}", stream: high_level, if_present: interjection}
+ # Spoken reply only: the assistant turn carries no text content,
+ # just a `say` tool call (`tool_calls_from: speech`). The chat
+ # tokenizer flattens it to a `<say>...</say>` marker, so the
+ # supervised target trains the model to respond to an
+ # interjection with a spoken acknowledgement.
+ - {role: assistant, stream: high_level, target: true, if_present: speech, tool_calls_from: speech}
+
+ # VQA is view-dependent: one sub-recipe per camera, resolved via `camera=...`.
+ # NOTE: `ask_vqa_top` binds `observation.images.front` (name says `top`, key
+ # says `front`). Keys match hirobot.yaml (`front` + `wrist`); adjust to your dataset.
+ ask_vqa_top:
+ weight: 0.075  # same weight as ask_vqa_wrist
+ bindings:  # question/answer pair filtered to the front camera
+ vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
+ vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
+ messages:
+ - role: user  # multimodal turn: front-camera image followed by the question text
+ stream: high_level
+ if_present: vqa_query
+ content:
+ - {type: image, feature: observation.images.front}
+ - {type: text, text: "${vqa_query}"}
+ - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
+
+ ask_vqa_wrist:
+ weight: 0.075  # same weight as ask_vqa_top
+ bindings:  # question/answer pair filtered to the wrist camera
+ vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
+ vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
+ messages:
+ - role: user  # multimodal turn: wrist-camera image followed by the question text
+ stream: high_level
+ if_present: vqa_query
+ content:
+ - {type: image, feature: observation.images.wrist}
+ - {type: text, text: "${vqa_query}"}
+ - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}