mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 05:59:52 +00:00
fix(annotate): bump same-frame subtasks onto distinct frames
If two consecutive VLM-emitted subtask spans have ``start`` timestamps
that round to the same source frame after ``snap_to_frame`` (e.g. on
short episodes the VLM sometimes nominates two ~adjacent action
boundaries within one 30 Hz step), the writer emits two
``style=subtask`` rows at the identical persistent timestamp. The
training-time renderer's default binding
``subtask: active_at(t, style=subtask)`` then raises:

    ValueError: Ambiguous resolver for style='subtask';
    add role=..., tool_name=..., or camera=... to disambiguate.

… and the whole training run dies on the first batch.
Observed concretely on ``pepijn223/super_poulain_vocab2`` (job
22159979): episodes 3 and 30 each had two subtask rows at the same
timestamp (``release yellow cube`` + ``retract arm`` snapping to the
same frame).
Add ``_dedupe_starts_to_distinct_frames`` to walk the cleaned span list
and, whenever a snapped start collides with one already used, push the
later span onto the next free frame timestamp. Both subtasks survive
on distinct timestamps; the renderer can now disambiguate. If the
episode genuinely has no later free frame (extremely unlikely — would
require a same-timestamp collision on the very last frame of the
episode), the later span is dropped with a warning rather than left
to poison the render.
New test ``test_plan_module_bumps_collocated_subtasks_to_distinct_frames``
locks in the contract; full vocabulary suite is 14/14 green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -366,6 +366,7 @@ class PlanSubtasksMemoryModule:
|
|||||||
continue
|
continue
|
||||||
cleaned.append({"text": text, "start": start, "end": end})
|
cleaned.append({"text": text, "start": start, "end": end})
|
||||||
cleaned.sort(key=lambda s: s["start"])
|
cleaned.sort(key=lambda s: s["start"])
|
||||||
|
cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
|
||||||
if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
|
if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"episode %d: every VLM subtask was off-vocab even after retry — "
|
"episode %d: every VLM subtask was off-vocab even after retry — "
|
||||||
@@ -375,6 +376,54 @@ class PlanSubtasksMemoryModule:
|
|||||||
)
|
)
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
|
@staticmethod
def _dedupe_starts_to_distinct_frames(
    spans: list[dict[str, Any]], record: EpisodeRecord
) -> list[dict[str, Any]]:
    """Give every subtask span its own frame-aligned ``start``.

    Each span's ``start`` is snapped with :func:`snap_to_frame` against
    ``record.frame_timestamps``. When the snapped value was already
    claimed by an earlier span in the list, the span is moved to the
    first strictly later frame timestamp that is still unclaimed, so
    two spans never share a ``start``. (Per the caller, identical
    starts would yield two ``style=subtask`` rows at one timestamp and
    trip the renderer's ``Ambiguous resolver for style='subtask'``
    error.)

    Args:
        spans: Span dicts carrying at least a ``"start"`` key; the
            caller passes them already sorted by ``start``.
        record: Episode whose ``frame_timestamps`` and
            ``episode_index`` are read.

    Returns:
        A new list of shallow-copied spans with ``start`` replaced by
        its (possibly bumped) frame timestamp and ``end`` raised to
        ``start`` whenever it would otherwise precede it. A span with
        no later unclaimed frame is omitted after logging a warning.
        When ``spans`` is empty or the record has no frame timestamps,
        the input list itself is returned untouched.
    """
    # Nothing to snap, or nothing to snap against: hand the input back as-is.
    if not spans or not (frames := record.frame_timestamps):
        return spans

    taken: set[float] = set()
    deduped: list[dict[str, Any]] = []
    for span in spans:
        snapped = snap_to_frame(span["start"], frames)
        if snapped in taken:
            # Look for the first later frame nobody has claimed yet.
            for candidate in frames:
                if candidate > snapped and candidate not in taken:
                    break
            else:
                # Ran off the end of the episode: drop this span.
                logger.warning(
                    "episode %d: subtask %r snapped to occupied frame "
                    "%.3f and no free later frame exists — dropping",
                    record.episode_index,
                    span.get("text"),
                    snapped,
                )
                continue
            snapped = candidate
        taken.add(snapped)

        # Copy so the caller's span dict is never mutated.
        adjusted = dict(span)
        adjusted["start"] = snapped
        # Keep the span well-formed: ``end`` may not precede ``start``.
        if float(adjusted.get("end", snapped)) < snapped:
            adjusted["end"] = snapped
        deduped.append(adjusted)
    return deduped
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Canonical-vocabulary helpers
|
# Canonical-vocabulary helpers
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|||||||
@@ -309,6 +309,51 @@ def test_plan_module_drops_off_vocab_subtask_after_retry(
|
|||||||
assert subtask_texts == ["grasp blue cube"]
|
assert subtask_texts == ["grasp blue cube"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_plan_module_bumps_collocated_subtasks_to_distinct_frames(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Subtasks whose starts share a source frame end up on separate frames.

    The stubbed VLM returns two canonical subtasks whose ``start`` values
    (0.40 and 0.41) are meant to snap to one frame. The plan module must
    still write two ``style=subtask`` rows, on different timestamps and in
    chronological order, so the renderer's ``active_at(t, style=subtask)``
    lookup stays unambiguous.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    # Starts lie within a single frame step of each other, forcing the
    # dedupe pass to bump the second span onto the following frame.
    stub_vlm = StubVlmClient(
        responder=lambda _messages: {
            "subtasks": [
                {"text": "grasp blue cube", "start": 0.40, "end": 0.42},
                {"text": "place blue cube in box", "start": 0.41, "end": 0.50},
            ]
        }
    )
    vocabulary = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
    plan_module = PlanSubtasksMemoryModule(
        vlm=stub_vlm,
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=vocabulary,
    )

    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    plan_module.run_episode(record, staging)

    subtask_rows = [row for row in staging.read("plan") if row["style"] == "subtask"]

    # Neither subtask was dropped, and each owns its own timestamp.
    assert len(subtask_rows) == 2
    first, second = subtask_rows
    timestamps = [first["timestamp"], second["timestamp"]]
    assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}"

    # Chronology survives: the earlier span keeps the earlier frame and the
    # later span lands on a strictly later one.
    assert first["content"] == "grasp blue cube"
    assert second["content"] == "place blue cube in box"
    assert second["timestamp"] > first["timestamp"]
|
||||||
|
|
||||||
|
|
||||||
def test_plan_module_empty_when_all_off_vocab_after_retry(
|
def test_plan_module_empty_when_all_off_vocab_after_retry(
|
||||||
fixture_dataset_root: Path, tmp_path: Path
|
fixture_dataset_root: Path, tmp_path: Path
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user