From e050d0fe0ad583cb22796f1009c90f38b9553836 Mon Sep 17 00:00:00 2001
From: pepijn <pepijn@huggingface.co>
Date: Thu, 21 May 2026 14:53:13 +0000
Subject: [PATCH] fix(recipes): use active_at for memory_update, rebalance
 subtask_mem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

memory_update was bound to `emitted_at(t, style=memory)`, which requires
the frame's exact timestamp to match a memory annotation. Memory rows are
placed only at subtask-boundary timestamps, which at 30 fps is ~1% of
frames, so ~99% of memory_update draws couldn't render and silently fell
through to `_fallback_low_level_render`, injecting task-conditioned
low-level training on ~30% of samples in subtask_mem.yaml (nearly the
entire 0.30 memory_update weight).

Switch to `active_at`. At inference, `MemoryUpdateFwd` is triggered only
on `subtask_change` events, but the model only needs to learn the
stateless mapping (prior_memory, completed_subtask) -> current_memory.
`active_at` supervises this mapping on every frame inside a subtask
interval, against varied observations; the trigger lives outside the
model. Net effect: memory_update renders on ~87% of frames, the fallback
leak drops from ~30% to ~4% of samples (0.30 weight x ~13% unrenderable,
before the rebalance below), and memory CE gets a meaningful training
share instead of ~0.3%.

subtask_mem.yaml: rebalance to 0.30 / 0.55 / 0.15 (low_level_execution
0.40 -> 0.55, memory_update 0.30 -> 0.15, the remaining 0.30 unchanged)
so memory CE is ~13% effective (0.15 x ~87% renderable) and the freed
weight goes to low_level_execution.
subtask_mem_vqa_speech.yaml: keep weights (memory_update=0.10 was
already balanced against the other text-CE branches).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/lerobot/configs/recipes/subtask_mem.yaml   | 18 +++++++++++++++---
 .../recipes/subtask_mem_vqa_speech.yaml        | 14 +++++++++++++-
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/lerobot/configs/recipes/subtask_mem.yaml b/src/lerobot/configs/recipes/subtask_mem.yaml
index 0b73eaac0..3dbed98ff 100644
--- a/src/lerobot/configs/recipes/subtask_mem.yaml
+++ b/src/lerobot/configs/recipes/subtask_mem.yaml
@@ -35,7 +35,7 @@ blend:
       - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
 
   low_level_execution:
-    weight: 0.40
+    weight: 0.55
     messages:
       # The action expert is conditioned on the SUBTASK — at inference
       # `HighLevelSubtaskFwd` generates it via the LM head and feeds it
@@ -45,10 +45,22 @@ blend:
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
 
   memory_update:
-    weight: 0.30
+    # At inference, `MemoryUpdateFwd` is triggered only on
+    # `subtask_change` events (sparse). Training densely with
+    # `active_at` — i.e. on every frame inside a subtask interval,
+    # not just the boundary frame — supervises the same
+    # (prior_memory, completed_subtask) → current_memory mapping
+    # against varied observations within the interval. The model
+    # learns a stateless transformation; the *when* to emit lives in
+    # the inference trigger, not the model. Annotations only exist
+    # for ~1% of frames as boundary events, so `emitted_at` would
+    # waste 99% of the blend draws (and silently leak them into a
+    # task-conditioned fallback); `active_at` lifts the renderable
+    # rate to ~87% on this dataset.
+    weight: 0.15
     bindings:
       prior_memory: "nth_prev(style=memory, offset=1)"
-      current_memory: "emitted_at(t, style=memory)"
+      current_memory: "active_at(t, style=memory)"
       completed_subtask: "nth_prev(style=subtask, offset=1)"
     messages:
       - {role: user, content: "${task}", stream: high_level}
diff --git a/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
index 4081e6972..366dcaa16 100644
--- a/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
+++ b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
@@ -45,10 +45,22 @@ blend:
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
 
   memory_update:
+    # At inference, `MemoryUpdateFwd` is triggered only on
+    # `subtask_change` events (sparse). Training densely with
+    # `active_at` — i.e. on every frame inside a subtask interval,
+    # not just the boundary frame — supervises the same
+    # (prior_memory, completed_subtask) → current_memory mapping
+    # against varied observations within the interval. The model
+    # learns a stateless transformation; the *when* to emit lives in
+    # the inference trigger, not the model. Annotations only exist
+    # for ~1% of frames as boundary events, so `emitted_at` would
+    # waste 99% of the blend draws (and silently leak them into the
+    # task-conditioned fallback); `active_at` lifts the renderable
+    # rate to ~87% on Hi-Robot-style datasets.
     weight: 0.10
     bindings:
       prior_memory: "nth_prev(style=memory, offset=1)"
-      current_memory: "emitted_at(t, style=memory)"
+      current_memory: "active_at(t, style=memory)"
       completed_subtask: "nth_prev(style=subtask, offset=1)"
     messages:
       - {role: user, content: "${task}", stream: high_level}