fix(annotate): bump same-frame subtasks onto distinct frames

If two consecutive VLM-emitted subtask spans have ``start`` timestamps that round to the same source frame after ``snap_to_frame`` (e.g. on short episodes the VLM sometimes nominates two ~adjacent action boundaries within one 30 Hz step), the writer emits two ``style=subtask`` rows at the identical persistent timestamp. The training-time renderer's default binding ``subtask: active_at(t, style=subtask)`` then raises ``ValueError: Ambiguous resolver for style='subtask'; add role=..., tool_name=..., or camera=... to disambiguate.`` and the whole training run dies on the first batch. Observed concretely on ``pepijn223/super_poulain_vocab2`` (job 22159979): episodes 3 and 30 each had two subtask rows at the same timestamp (``release yellow cube`` + ``retract arm`` snapping to the same frame). Add ``_dedupe_starts_to_distinct_frames`` to walk the cleaned span list and, whenever a snapped start collides with one already used, push the later span onto the next free frame timestamp. Both subtasks survive on distinct timestamps; the renderer can now disambiguate. If the episode genuinely has no later free frame (extremely unlikely — would require a same-timestamp collision on the very last frame of the episode), the later span is dropped with a warning rather than left to poison the render. New test ``test_plan_module_bumps_collocated_subtasks_to_distinct_frames`` locks in the contract; full vocabulary suite is 14/14 green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-07-10 11:31:57 +00:00 · 2026-05-23 19:31:44 +00:00
parent a15e16c072
commit 471b2b1b1d
2 changed files with 94 additions and 0 deletions
@@ -366,6 +366,7 @@ class PlanSubtasksMemoryModule:
                continue
            cleaned.append({"text": text, "start": start, "end": end})
        cleaned.sort(key=lambda s: s["start"])
+        cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
        if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
            logger.warning(
                "episode %d: every VLM subtask was off-vocab even after retry — "
@@ -375,6 +376,54 @@ class PlanSubtasksMemoryModule:
            )
        return cleaned

+    @staticmethod
+    def _dedupe_starts_to_distinct_frames(
+        spans: list[dict[str, Any]], record: EpisodeRecord
+    ) -> list[dict[str, Any]]:
+        """Bump same-frame subtask starts onto distinct frames.
+
+        Two consecutive VLM spans whose ``start`` rounds to the same
+        source frame (after :func:`snap_to_frame`) would otherwise emit
+        two ``style=subtask`` rows at the identical persistent
+        timestamp. The training-time renderer's ``active_at(t,
+        style=subtask)`` resolver can't disambiguate that and raises
+        ``Ambiguous resolver for style='subtask'``.
+
+        Walk the (sorted-by-start) spans, snap each to its frame, and
+        if the snapped frame is already taken push the span onto the
+        next unused frame so both subtasks survive on distinct
+        timestamps. If the episode ends before a free frame is found,
+        the trailing span is dropped with a warning — better than
+        poisoning the render.
+
+        Every surviving span is returned as a new dict whose ``start``
+        is the snapped (or bumped) frame timestamp; the input dicts are
+        not modified. An ``end`` that would fall before the new
+        ``start`` is raised to equal it. When ``spans`` or
+        ``record.frame_timestamps`` is empty, the input list is returned
+        as-is and no snapping is applied.
+        """
+        if not spans:
+            return spans
+        frames = record.frame_timestamps
+        if not frames:
+            return spans
+        # Frame timestamps already claimed by a span emitted into ``out``.
+        used: set[float] = set()
+        out: list[dict[str, Any]] = []
+        for span in spans:
+            ts = snap_to_frame(span["start"], frames)
+            if ts in used:
+                # First entry of ``frames`` (in iteration order) that is later
+                # than the collision and not yet claimed.
+                # NOTE(review): assumes ``frames`` is ascending — confirm.
+                next_ts = next((f for f in frames if f > ts and f not in used), None)
+                if next_ts is None:
+                    logger.warning(
+                        "episode %d: subtask %r snapped to occupied frame "
+                        "%.3f and no free later frame exists — dropping",
+                        record.episode_index,
+                        span.get("text"),
+                        ts,
+                    )
+                    continue
+                ts = next_ts
+            used.add(ts)
+            # Copy so the caller's span dict is left untouched.
+            new_span = {**span, "start": ts}
+            # A snapped or bumped start may overtake the span's end; keep
+            # end >= start. A missing ``end`` compares as ``ts`` and is left
+            # absent.
+            if float(new_span.get("end", ts)) < ts:
+                new_span["end"] = ts
+            out.append(new_span)
+        return out
+
    # ------------------------------------------------------------------
    # Canonical-vocabulary helpers
    # ------------------------------------------------------------------
@@ -309,6 +309,51 @@ def test_plan_module_drops_off_vocab_subtask_after_retry(
    assert subtask_texts == ["grasp blue cube"]


+def test_plan_module_bumps_collocated_subtasks_to_distinct_frames(
+    fixture_dataset_root: Path, tmp_path: Path
+) -> None:
+    """Two subtasks whose starts snap to the same frame get split onto two frames.
+
+    Without this guard, both spans would emit ``style=subtask`` rows at the
+    identical persistent timestamp; the training-time renderer's
+    ``active_at(t, style=subtask)`` then raises an ambiguity error.
+
+    The stubbed VLM always answers with the same two spans, whose starts
+    (0.40 s and 0.41 s) are 0.01 s apart.
+    """
+    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
+
+    def responder(_messages):
+        # Two canonical labels with starts within one frame of each other —
+        # both snap to the same source frame, so the dedupe pass must bump
+        # the later one to the next frame.
+        return {
+            "subtasks": [
+                {"text": "grasp blue cube", "start": 0.40, "end": 0.42},
+                {"text": "place blue cube in box", "start": 0.41, "end": 0.50},
+            ]
+        }
+
+    vlm = StubVlmClient(responder=responder)
+    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
+    module = PlanSubtasksMemoryModule(
+        vlm=vlm,
+        config=PlanConfig(n_task_rephrasings=0),
+        vocabulary=vocab,
+    )
+    # Only the first episode yielded for the fixture dataset is exercised.
+    record = next(iter_episodes(fixture_dataset_root))
+    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
+    module.run_episode(record, staging)
+    # NOTE(review): presumably "plan" is the staging key this module writes
+    # its rows under — confirm against the module.
+    rows = staging.read("plan")
+    subtask_rows = [r for r in rows if r["style"] == "subtask"]
+    # Both subtasks present, both on distinct timestamps.
+    assert len(subtask_rows) == 2
+    timestamps = [r["timestamp"] for r in subtask_rows]
+    assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}"
+    # Order preserved: the chronologically earlier span keeps the earlier
+    # frame, the later one was bumped onto the next available frame.
+    # NOTE(review): the index-based checks assume staged rows come back in
+    # span order — confirm.
+    assert subtask_rows[0]["content"] == "grasp blue cube"
+    assert subtask_rows[1]["content"] == "place blue cube in box"
+    assert subtask_rows[1]["timestamp"] > subtask_rows[0]["timestamp"]
+
+
 def test_plan_module_empty_when_all_off_vocab_after_retry(
    fixture_dataset_root: Path, tmp_path: Path
 ) -> None: