fix(annotate): bump same-frame subtasks onto distinct frames

If two consecutive VLM-emitted subtask spans have ``start`` timestamps that round to the same source frame after ``snap_to_frame`` (e.g. on short episodes the VLM sometimes nominates two ~adjacent action boundaries within one 30 Hz step), the writer emits two ``style=subtask`` rows at the identical persistent timestamp. The training-time renderer's default binding ``subtask: active_at(t, style=subtask)`` then raises: ValueError: Ambiguous resolver for style='subtask'; add role=..., tool_name=..., or camera=... to disambiguate. … and the whole training run dies on the first batch. Observed concretely on ``pepijn223/super_poulain_vocab2`` (job 22159979): episodes 3 and 30 each had two subtask rows at the same timestamp (``release yellow cube`` + ``retract arm`` snapping to the same frame). Add ``_dedupe_starts_to_distinct_frames`` to walk the cleaned span list and, whenever a snapped start collides with one already used, push the later span onto the next free frame timestamp. Both subtasks survive on distinct timestamps; the renderer can now disambiguate. If the episode genuinely has no later free frame (extremely unlikely — would require a same-timestamp collision on the very last frame of the episode), the later span is dropped with a warning rather than left to poison the render. New test ``test_plan_module_bumps_collocated_subtasks_to_distinct_frames`` locks in the contract; full vocabulary suite is 14/14 green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-07-24 10:16:09 +00:00 · 2026-05-23 19:31:44 +00:00
parent a15e16c072
commit 471b2b1b1d
2 changed files with 94 additions and 0 deletions
@@ -366,6 +366,7 @@ class PlanSubtasksMemoryModule:
                continue
            cleaned.append({"text": text, "start": start, "end": end})
        cleaned.sort(key=lambda s: s["start"])
        cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
        if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
            logger.warning(
                "episode %d: every VLM subtask was off-vocab even after retry — "
@@ -375,6 +376,54 @@ class PlanSubtasksMemoryModule:
            )
        return cleaned
    @staticmethod
    def _dedupe_starts_to_distinct_frames(
        spans: list[dict[str, Any]], record: EpisodeRecord
    ) -> list[dict[str, Any]]:
        """Bump same-frame subtask starts onto distinct frames.
        Two consecutive VLM spans whose ``start`` rounds to the same
        source frame (after :func:`snap_to_frame`) would otherwise emit
        two ``style=subtask`` rows at the identical persistent
        timestamp. The training-time renderer's ``active_at(t,
        style=subtask)`` resolver can't disambiguate that and raises
        ``Ambiguous resolver for style='subtask'``.
        Walk the (sorted-by-start) spans, snap each to its frame, and
        if the snapped frame is already taken push the span onto the
        next unused frame so both subtasks survive on distinct
        timestamps. If the episode ends before a free frame is found,
        the trailing span is dropped with a warning — better than
        poisoning the render.
        """
        if not spans:
            return spans
        frames = record.frame_timestamps
        if not frames:
            return spans
        used: set[float] = set()
        out: list[dict[str, Any]] = []
        for span in spans:
            ts = snap_to_frame(span["start"], frames)
            if ts in used:
                next_ts = next((f for f in frames if f > ts and f not in used), None)
                if next_ts is None:
                    logger.warning(
                        "episode %d: subtask %r snapped to occupied frame "
                        "%.3f and no free later frame exists — dropping",
                        record.episode_index,
                        span.get("text"),
                        ts,
                    )
                    continue
                ts = next_ts
            used.add(ts)
            new_span = {**span, "start": ts}
            if float(new_span.get("end", ts)) < ts:
                new_span["end"] = ts
            out.append(new_span)
        return out
    # ------------------------------------------------------------------
    # Canonical-vocabulary helpers
    # ------------------------------------------------------------------
@@ -309,6 +309,51 @@ def test_plan_module_drops_off_vocab_subtask_after_retry(
    assert subtask_texts == ["grasp blue cube"]
 def test_plan_module_bumps_collocated_subtasks_to_distinct_frames(
    fixture_dataset_root: Path, tmp_path: Path
 ) -> None:
    """Two subtasks whose starts snap to the same frame get split onto two frames.
    Without this guard, both spans would emit ``style=subtask`` rows at the
    identical persistent timestamp; the training-time renderer's
    ``active_at(t, style=subtask)`` then raises an ambiguity error.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
    def responder(_messages):
        # Two canonical labels with starts within one frame of each other —
        # both snap to the same source frame, so the dedupe pass must bump
        # the later one to the next frame.
        return {
            "subtasks": [
                {"text": "grasp blue cube", "start": 0.40, "end": 0.42},
                {"text": "place blue cube in box", "start": 0.41, "end": 0.50},
            ]
        }
    vlm = StubVlmClient(responder=responder)
    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
    module = PlanSubtasksMemoryModule(
        vlm=vlm,
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=vocab,
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)
    rows = staging.read("plan")
    subtask_rows = [r for r in rows if r["style"] == "subtask"]
    # Both subtasks present, both on distinct timestamps.
    assert len(subtask_rows) == 2
    timestamps = [r["timestamp"] for r in subtask_rows]
    assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}"
    # Order preserved: the chronologically earlier span keeps the earlier
    # frame, the later one was bumped onto the next available frame.
    assert subtask_rows[0]["content"] == "grasp blue cube"
    assert subtask_rows[1]["content"] == "place blue cube in box"
    assert subtask_rows[1]["timestamp"] > subtask_rows[0]["timestamp"]
 def test_plan_module_empty_when_all_off_vocab_after_retry(
    fixture_dataset_root: Path, tmp_path: Path
 ) -> None: