From 474c5478d93c3b42d7efc914bc7c326acfad0b6e Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 18 May 2026 17:24:36 +0200
Subject: [PATCH] tune(annotations): VQA emission anchors a single frame (K 3 -> 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Module 3 anchored each VQA emission tick to K=3 consecutive frames (~0.1s
at 30fps). The VLM grounds the answer — bbox/keypoint coordinates
especially — against the first frame's image, so copying it onto frames
2-3 smears a stale label over a moving scene.

Default K=1: a VQA pair lands on exactly its emission frame, no temporal
smear. VQA frames get sparser; the WeightedEpisodeAwareSampler
(vqa_target_fraction) is the knob to compensate.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/config.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 678930784..8c0d48f0a 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -114,7 +114,14 @@ class Module3Config:
 
     enabled: bool = True
     vqa_emission_hz: float = 1.0
-    K: int = 3
+    K: int = 1
+    """How many *consecutive* frames each emission tick anchors a VQA pair
+    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
+    against the *first* anchored frame's image, so anchoring K>1 frames
+    copies that same answer onto later frames where the scene has already
+    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
+    emission frame, no temporal smear. Raise it only to trade label
+    precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
 