From 474c5478d93c3b42d7efc914bc7c326acfad0b6e Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 17:24:36 +0200
Subject: [PATCH] tune(annotations): VQA emission anchors a single frame (K 3
 -> 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Module 3 anchored each VQA emission tick to K=3 consecutive frames
(~0.1s at 30fps). The VLM grounds the answer — bbox/keypoint
coordinates especially — against the first frame's image, so copying that
answer onto frames 2-3 smears a stale label over a moving scene.

The default is now K=1: each VQA pair lands on exactly its emission
frame, with no temporal smear. VQA frames get sparser as a result; the
WeightedEpisodeAwareSampler (vqa_target_fraction) is the knob to compensate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lerobot/annotations/steerable_pipeline/config.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 678930784..8c0d48f0a 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -114,7 +114,14 @@ class Module3Config:
 
     enabled: bool = True
     vqa_emission_hz: float = 1.0
-    K: int = 3
+    K: int = 1
+    """How many *consecutive* frames each emission tick anchors a VQA pair
+    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
+    against the *first* anchored frame's image, so anchoring K>1 frames
+    copies that same answer onto later frames where the scene has already
+    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
+    emission frame, no temporal smear. Raise it only to trade label
+    precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")