From 471b2b1b1dee5fde8dba9482bc14bd56c09e8666 Mon Sep 17 00:00:00 2001 From: pepijn Date: Sat, 23 May 2026 19:31:44 +0000 Subject: [PATCH] fix(annotate): bump same-frame subtasks onto distinct frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If two consecutive VLM-emitted subtask spans have ``start`` timestamps that round to the same source frame after ``snap_to_frame`` (e.g. on short episodes the VLM sometimes nominates two ~adjacent action boundaries within one 30 Hz step), the writer emits two ``style=subtask`` rows at the identical persistent timestamp. The training-time renderer's default binding ``subtask: active_at(t, style=subtask)`` then raises: ValueError: Ambiguous resolver for style='subtask'; add role=..., tool_name=..., or camera=... to disambiguate. … and the whole training run dies on the first batch. Observed concretely on ``pepijn223/super_poulain_vocab2`` (job 22159979): episodes 3 and 30 each had two subtask rows at the same timestamp (``release yellow cube`` + ``retract arm`` snapping to the same frame). Add ``_dedupe_starts_to_distinct_frames`` to walk the cleaned span list and, whenever a snapped start collides with one already used, push the later span onto the next free frame timestamp. Both subtasks survive on distinct timestamps; the renderer can now disambiguate. If the episode genuinely has no later free frame (extremely unlikely — would require a same-timestamp collision on the very last frame of the episode), the later span is dropped with a warning rather than left to poison the render. New test ``test_plan_module_bumps_collocated_subtasks_to_distinct_frames`` locks in the contract; full vocabulary suite is 14/14 green. Co-Authored-By: Claude Opus 4.7 (1M context) Co-authored-by: Cursor --- .../modules/plan_subtasks_memory.py | 49 +++++++++++++++++++ tests/annotations/test_vocabulary.py | 45 +++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index 7d55a5d8d..b9bae607e 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -366,6 +366,7 @@ class PlanSubtasksMemoryModule: continue cleaned.append({"text": text, "start": start, "end": end}) cleaned.sort(key=lambda s: s["start"]) + cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record) if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned: logger.warning( "episode %d: every VLM subtask was off-vocab even after retry — " @@ -375,6 +376,54 @@ class PlanSubtasksMemoryModule: ) return cleaned + @staticmethod + def _dedupe_starts_to_distinct_frames( + spans: list[dict[str, Any]], record: EpisodeRecord + ) -> list[dict[str, Any]]: + """Bump same-frame subtask starts onto distinct frames. + + Two consecutive VLM spans whose ``start`` rounds to the same + source frame (after :func:`snap_to_frame`) would otherwise emit + two ``style=subtask`` rows at the identical persistent + timestamp. The training-time renderer's ``active_at(t, + style=subtask)`` resolver can't disambiguate that and raises + ``Ambiguous resolver for style='subtask'``. + + Walk the (sorted-by-start) spans, snap each to its frame, and + if the snapped frame is already taken push the span onto the + next unused frame so both subtasks survive on distinct + timestamps. If the episode ends before a free frame is found, + the trailing span is dropped with a warning — better than + poisoning the render. + """ + if not spans: + return spans + frames = record.frame_timestamps + if not frames: + return spans + used: set[float] = set() + out: list[dict[str, Any]] = [] + for span in spans: + ts = snap_to_frame(span["start"], frames) + if ts in used: + next_ts = next((f for f in frames if f > ts and f not in used), None) + if next_ts is None: + logger.warning( + "episode %d: subtask %r snapped to occupied frame " + "%.3f and no free later frame exists — dropping", + record.episode_index, + span.get("text"), + ts, + ) + continue + ts = next_ts + used.add(ts) + new_span = {**span, "start": ts} + if float(new_span.get("end", ts)) < ts: + new_span["end"] = ts + out.append(new_span) + return out + # ------------------------------------------------------------------ # Canonical-vocabulary helpers # ------------------------------------------------------------------ diff --git a/tests/annotations/test_vocabulary.py b/tests/annotations/test_vocabulary.py index 1f1c046fe..7b820834d 100644 --- a/tests/annotations/test_vocabulary.py +++ b/tests/annotations/test_vocabulary.py @@ -309,6 +309,51 @@ def test_plan_module_drops_off_vocab_subtask_after_retry( assert subtask_texts == ["grasp blue cube"] +def test_plan_module_bumps_collocated_subtasks_to_distinct_frames( + fixture_dataset_root: Path, tmp_path: Path +) -> None: + """Two subtasks whose starts snap to the same frame get split onto two frames. + + Without this guard, both spans would emit ``style=subtask`` rows at the + identical persistent timestamp; the training-time renderer's + ``active_at(t, style=subtask)`` then raises an ambiguity error. + """ + from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient + + def responder(_messages): + # Two canonical labels with starts within one frame of each other — + # both snap to the same source frame, so the dedupe pass must bump + # the later one to the next frame. + return { + "subtasks": [ + {"text": "grasp blue cube", "start": 0.40, "end": 0.42}, + {"text": "place blue cube in box", "start": 0.41, "end": 0.50}, + ] + } + + vlm = StubVlmClient(responder=responder) + vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY) + module = PlanSubtasksMemoryModule( + vlm=vlm, + config=PlanConfig(n_task_rephrasings=0), + vocabulary=vocab, + ) + record = next(iter_episodes(fixture_dataset_root)) + staging = EpisodeStaging(tmp_path / "stage", record.episode_index) + module.run_episode(record, staging) + rows = staging.read("plan") + subtask_rows = [r for r in rows if r["style"] == "subtask"] + # Both subtasks present, both on distinct timestamps. + assert len(subtask_rows) == 2 + timestamps = [r["timestamp"] for r in subtask_rows] + assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}" + # Order preserved: the chronologically earlier span keeps the earlier + # frame, the later one was bumped onto the next available frame. + assert subtask_rows[0]["content"] == "grasp blue cube" + assert subtask_rows[1]["content"] == "place blue cube in box" + assert subtask_rows[1]["timestamp"] > subtask_rows[0]["timestamp"] + + def test_plan_module_empty_when_all_off_vocab_after_retry( fixture_dataset_root: Path, tmp_path: Path ) -> None: