diff --git a/src/lerobot/configs/recipes/subtask_mem.yaml b/src/lerobot/configs/recipes/subtask_mem.yaml
index 0b73eaac0..3dbed98ff 100644
--- a/src/lerobot/configs/recipes/subtask_mem.yaml
+++ b/src/lerobot/configs/recipes/subtask_mem.yaml
@@ -35,7 +35,7 @@ blend:
       - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
 
   low_level_execution:
-    weight: 0.40
+    weight: 0.55
     messages:
       # The action expert is conditioned on the SUBTASK — at inference
       # `HighLevelSubtaskFwd` generates it via the LM head and feeds it
@@ -45,10 +45,22 @@ blend:
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
 
   memory_update:
-    weight: 0.30
+    # At inference, `MemoryUpdateFwd` is triggered only on
+    # `subtask_change` events (sparse). Training densely with
+    # `active_at` — i.e. on every frame inside a subtask interval,
+    # not just the boundary frame — supervises the same
+    # (prior_memory, completed_subtask) → current_memory mapping
+    # against varied observations within the interval. The model
+    # learns a stateless transformation; the *when* to emit lives in
+    # the inference trigger, not the model. Annotations only exist
+    # for ~1% of frames as boundary events, so `emitted_at` would
+    # waste 99% of the blend draws (and silently leak them into a
+    # task-conditioned fallback); `active_at` lifts the renderable
+    # rate to ~87% on this dataset.
+    weight: 0.15
     bindings:
       prior_memory: "nth_prev(style=memory, offset=1)"
-      current_memory: "emitted_at(t, style=memory)"
+      current_memory: "active_at(t, style=memory)"
       completed_subtask: "nth_prev(style=subtask, offset=1)"
     messages:
       - {role: user, content: "${task}", stream: high_level}
diff --git a/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
index 4081e6972..366dcaa16 100644
--- a/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
+++ b/src/lerobot/configs/recipes/subtask_mem_vqa_speech.yaml
@@ -45,10 +45,22 @@ blend:
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
 
   memory_update:
+    # At inference, `MemoryUpdateFwd` is triggered only on
+    # `subtask_change` events (sparse). Training densely with
+    # `active_at` — i.e. on every frame inside a subtask interval,
+    # not just the boundary frame — supervises the same
+    # (prior_memory, completed_subtask) → current_memory mapping
+    # against varied observations within the interval. The model
+    # learns a stateless transformation; the *when* to emit lives in
+    # the inference trigger, not the model. Annotations only exist
+    # for ~1% of frames as boundary events, so `emitted_at` would
+    # waste 99% of the blend draws (and silently leak them into the
+    # task-conditioned fallback); `active_at` lifts the renderable
+    # rate to ~87% on Hi-Robot-style datasets.
     weight: 0.10
     bindings:
       prior_memory: "nth_prev(style=memory, offset=1)"
-      current_memory: "emitted_at(t, style=memory)"
+      current_memory: "active_at(t, style=memory)"
       completed_subtask: "nth_prev(style=subtask, offset=1)"
     messages:
       - {role: user, content: "${task}", stream: high_level}