fix(annotate): never leave an episode with zero canonical subtasks

When the canonical vocabulary is enabled and the VLM produces spans
that don't overlap any canonical label, the previous Jaccard-floor
(0.5) dropped them and the episode came out with no subtasks at all
— invisible to the downstream policy. Observed on
``pepijn223/super_poulain_vocab``: some episodes had empty subtask
columns because every VLM-emitted phrase scored below 0.5 against
the discovered vocabulary.

Two-pass canonicalisation:

  - First pass keeps the Jaccard floor (lowered from 0.5 → 0.25, to
    let mild paraphrases through) and drops everything below.
  - If that first pass leaves the episode with **zero** subtasks,
    fall back to a second pass that always snaps each VLM span to
    its nearest canonical label by Jaccard (no floor). The episode
    ends up with subtasks even when the vocabulary missed a phase
    — a slightly-wrong canonical label is still closer to the right
    motion than nothing at all.
  - Log loudly when the fallback fires so the operator can spot
    coverage gaps in ``meta/canonical_vocabulary.json``.
  - Log a per-episode count at INFO when some (but not all) spans
    were dropped so it's visible without spamming the run output.

Promote the Jaccard floor + ignore-tokens to class constants so
they're a single edit point. Add ``force=True`` parameter to
``_canonicalize_subtask`` for the no-floor fallback path.

New test ``test_plan_module_snaps_when_all_off_vocab`` covers the
fallback; existing ``test_plan_module_drops_off_vocab_subtask`` is
adjusted to keep at least one in-vocab span so the floor path can
still fire and is exercised. All 12 vocabulary tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
pepijn
2026-05-22 12:44:03 +00:00
parent 54221ceea2
commit 336af85c09
2 changed files with 119 additions and 24 deletions
@@ -328,7 +328,7 @@ class PlanSubtasksMemoryModule:
# clamp to [t0, t_last] and sort
t0 = record.frame_timestamps[0]
t_last = record.frame_timestamps[-1]
cleaned: list[dict[str, Any]] = []
raw: list[dict[str, Any]] = []
for span in spans:
try:
start = float(span["start"])
@@ -340,12 +340,45 @@ class PlanSubtasksMemoryModule:
end = max(t0, min(end, t_last))
if end < start:
start, end = end, start
if not text:
continue
text = self._canonicalize_subtask(text)
if not text:
continue
cleaned.append({"text": text, "start": start, "end": end})
if text:
raw.append({"text": text, "start": start, "end": end})
# Without a vocabulary, free-form spans pass through unchanged.
if self.vocabulary is None or not self.vocabulary.subtasks:
raw.sort(key=lambda s: s["start"])
return raw
# With a vocabulary, snap each span to the closest canonical
# label. Two-pass: first try the normal Jaccard floor (drops
# off-topic hallucinations); if that leaves the episode with
# zero subtasks, fall back to snap-without-floor so the episode
# is never silently emptied — a wrong canonical label is still
# closer to the right phase than nothing at all.
cleaned: list[dict[str, Any]] = []
for span in raw:
mapped = self._canonicalize_subtask(span["text"])
if mapped:
cleaned.append({**span, "text": mapped})
if not cleaned and raw:
logger.warning(
"episode %d: every VLM subtask was off-vocabulary "
"(%d spans); snapping to closest canonical label anyway "
"(check meta/canonical_vocabulary.json for missing phases)",
record.episode_index,
len(raw),
)
for span in raw:
mapped = self._canonicalize_subtask(span["text"], force=True)
if mapped:
cleaned.append({**span, "text": mapped})
elif len(cleaned) < len(raw):
logger.info(
"episode %d: %d/%d subtasks survived canonicalisation; "
"the rest were off-vocabulary",
record.episode_index,
len(cleaned),
len(raw),
)
cleaned.sort(key=lambda s: s["start"])
return cleaned
@@ -387,15 +420,28 @@ class PlanSubtasksMemoryModule:
f"{bullets}\n\n"
)
def _canonicalize_subtask(self, text: str) -> str:
"""Snap ``text`` to the closest canonical subtask string, or drop it.
_CANONICALIZE_JACCARD_FLOOR: float = 0.25
_CANONICALIZE_IGNORE_TOKENS: frozenset[str] = frozenset(
{"the", "a", "an", "to", "into", "from", "of", "on", "over", "at"}
)
def _canonicalize_subtask(self, text: str, *, force: bool = False) -> str:
"""Snap ``text`` to the closest canonical subtask string.
Without a vocabulary, the original text passes through. With a
vocabulary, an exact case-insensitive match wins; failing that,
the best Jaccard overlap on the word set is used as a tolerant
fuzzy match (handles articles / minor reorderings). If nothing
clears the floor, the subtask is dropped — better to skip a
phase than to feed the action expert an off-distribution string.
fuzzy match (handles articles / minor reorderings).
Behaviour at the Jaccard floor depends on ``force``:
- ``force=False`` (default): below ``_CANONICALIZE_JACCARD_FLOOR``
the subtask is dropped. ``_generate_subtasks`` runs this first
to filter genuine off-topic hallucinations.
- ``force=True``: always snap, no floor. ``_generate_subtasks``
uses this in a second pass when the first pass would otherwise
empty the episode — a slightly-wrong canonical label is still
closer to the right phase than no subtask at all, which makes
the whole episode invisible to the downstream policy.
"""
if self.vocabulary is None or not self.vocabulary.subtasks:
return text.strip()
@@ -406,14 +452,17 @@ class PlanSubtasksMemoryModule:
if candidate.lower() == lowered:
return candidate
# Jaccard fallback: token-set overlap, drop articles + adverbs.
ignore = {"the", "a", "an", "to", "into", "from", "of", "on", "over", "at"}
words = {w for w in lowered.replace(",", " ").split() if w and w not in ignore}
words = {
w for w in lowered.replace(",", " ").split()
if w and w not in self._CANONICALIZE_IGNORE_TOKENS
}
if not words:
return ""
best: tuple[float, str] | None = None
for candidate in candidates:
cand_words = {
w for w in candidate.lower().replace(",", " ").split() if w and w not in ignore
w for w in candidate.lower().replace(",", " ").split()
if w and w not in self._CANONICALIZE_IGNORE_TOKENS
}
if not cand_words:
continue
@@ -422,14 +471,16 @@ class PlanSubtasksMemoryModule:
score = inter / union if union else 0.0
if best is None or score > best[0]:
best = (score, candidate)
# Floor: require at least ~half the tokens to overlap. Below that
# the VLM is hallucinating a novel phrase; drop rather than warp
# it into something semantically wrong.
if best is None or best[0] < 0.5:
logger.warning(
"subtask %r did not match any canonical label (best=%s) — dropping",
if best is None:
return ""
if not force and best[0] < self._CANONICALIZE_JACCARD_FLOOR:
logger.info(
"subtask %r dropped — best canonical match %r scored %.2f "
"(< %.2f Jaccard floor)",
cleaned,
best,
best[1],
best[0],
self._CANONICALIZE_JACCARD_FLOOR,
)
return ""
return best[1]
+46 -2
View File
@@ -217,13 +217,20 @@ def test_plan_module_canonicalizes_paraphrased_subtask(
def test_plan_module_drops_off_vocab_subtask(
fixture_dataset_root: Path, tmp_path: Path
) -> None:
"""A subtask with low overlap to every canonical label is dropped."""
"""A subtask with low overlap to every canonical label is dropped.
Drop only kicks in when *at least one* other subtask survives — if
every span would be dropped the episode would come out empty, so
``_generate_subtasks`` falls back to snap-without-floor; that path
is exercised by ``test_plan_module_snaps_when_all_off_vocab``.
"""
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
def responder(_messages):
return {
"subtasks": [
# in-vocab
# in-vocab — keeps the episode non-empty so the floor
# is allowed to drop the next span.
{"text": "grasp blue cube", "start": 0.0, "end": 0.4},
# off-vocab hallucination — no token overlap above the
# Jaccard floor; should be dropped.
@@ -246,6 +253,43 @@ def test_plan_module_drops_off_vocab_subtask(
assert subtask_texts == ["grasp blue cube"]
def test_plan_module_snaps_when_all_off_vocab(
fixture_dataset_root: Path, tmp_path: Path
) -> None:
"""All-off-vocab spans snap to nearest canonical instead of emptying the episode."""
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
def responder(_messages):
return {
"subtasks": [
# Both off-vocab — would normally be dropped. The
# fallback should snap each to its best canonical match
# rather than leave the episode with no subtasks at all.
{"text": "make a smoothie", "start": 0.0, "end": 0.4},
{"text": "consult the wizard", "start": 0.4, "end": 0.9},
]
}
vlm = StubVlmClient(responder=responder)
vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
module = PlanSubtasksMemoryModule(
vlm=vlm,
config=PlanConfig(n_task_rephrasings=0),
vocabulary=vocab,
)
record = next(iter_episodes(fixture_dataset_root))
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
module.run_episode(record, staging)
rows = staging.read("plan")
subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
# Two off-vocab spans → two canonical subtasks (snapped to nearest
# by Jaccard with no floor). The exact canonical choice doesn't
# matter — only that the episode came out with subtasks rather
# than empty.
assert len(subtask_texts) == 2
assert all(s in _CANONICAL_SUBTASKS for s in subtask_texts)
def test_plan_module_without_vocab_passes_through(
fixture_dataset_root: Path, tmp_path: Path
) -> None: