mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-23 20:50:02 +00:00
fix(annotate): never leave an episode with zero canonical subtasks
When the canonical vocabulary is enabled and the VLM produces spans
that don't overlap any canonical label, the previous Jaccard-floor
(0.5) dropped them and the episode came out with no subtasks at all
— invisible to the downstream policy. Observed on
``pepijn223/super_poulain_vocab``: some episodes had empty subtask
columns because every VLM-emitted phrase scored below 0.5 against
the discovered vocabulary.
Two-pass canonicalisation:
- First pass keeps the Jaccard floor (lowered from 0.5 to 0.25 so
mild paraphrases get through) and drops every span scoring below it.
- If that first pass leaves the episode with **zero** subtasks,
fall back to a second pass that always snaps each VLM span to
its nearest canonical label by Jaccard (no floor). The episode
ends up with subtasks even when the vocabulary missed a phase
— a slightly-wrong canonical label is still closer to the right
motion than nothing at all.
- Log loudly when the fallback fires so the operator can spot
coverage gaps in ``meta/canonical_vocabulary.json``.
- Log a per-episode count at INFO when some (but not all) spans
were dropped so it's visible without spamming the run output.
Promote the Jaccard floor + ignore-tokens to class constants so
they're a single edit point. Add a keyword-only ``force`` parameter
(default ``False``) to ``_canonicalize_subtask``; the no-floor
fallback path calls it with ``force=True``.
New test ``test_plan_module_snaps_when_all_off_vocab`` covers the
fallback; existing ``test_plan_module_drops_off_vocab_subtask`` is
adjusted to keep at least one in-vocab span so the floor path can
still fire and is exercised. All 12 vocabulary tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -328,7 +328,7 @@ class PlanSubtasksMemoryModule:
|
|||||||
# clamp to [t0, t_last] and sort
|
# clamp to [t0, t_last] and sort
|
||||||
t0 = record.frame_timestamps[0]
|
t0 = record.frame_timestamps[0]
|
||||||
t_last = record.frame_timestamps[-1]
|
t_last = record.frame_timestamps[-1]
|
||||||
cleaned: list[dict[str, Any]] = []
|
raw: list[dict[str, Any]] = []
|
||||||
for span in spans:
|
for span in spans:
|
||||||
try:
|
try:
|
||||||
start = float(span["start"])
|
start = float(span["start"])
|
||||||
@@ -340,12 +340,45 @@ class PlanSubtasksMemoryModule:
|
|||||||
end = max(t0, min(end, t_last))
|
end = max(t0, min(end, t_last))
|
||||||
if end < start:
|
if end < start:
|
||||||
start, end = end, start
|
start, end = end, start
|
||||||
if not text:
|
if text:
|
||||||
continue
|
raw.append({"text": text, "start": start, "end": end})
|
||||||
text = self._canonicalize_subtask(text)
|
|
||||||
if not text:
|
# Without a vocabulary, free-form spans pass through unchanged.
|
||||||
continue
|
if self.vocabulary is None or not self.vocabulary.subtasks:
|
||||||
cleaned.append({"text": text, "start": start, "end": end})
|
raw.sort(key=lambda s: s["start"])
|
||||||
|
return raw
|
||||||
|
|
||||||
|
# With a vocabulary, snap each span to the closest canonical
|
||||||
|
# label. Two-pass: first try the normal Jaccard floor (drops
|
||||||
|
# off-topic hallucinations); if that leaves the episode with
|
||||||
|
# zero subtasks, fall back to snap-without-floor so the episode
|
||||||
|
# is never silently emptied — a wrong canonical label is still
|
||||||
|
# closer to the right phase than nothing at all.
|
||||||
|
cleaned: list[dict[str, Any]] = []
|
||||||
|
for span in raw:
|
||||||
|
mapped = self._canonicalize_subtask(span["text"])
|
||||||
|
if mapped:
|
||||||
|
cleaned.append({**span, "text": mapped})
|
||||||
|
if not cleaned and raw:
|
||||||
|
logger.warning(
|
||||||
|
"episode %d: every VLM subtask was off-vocabulary "
|
||||||
|
"(%d spans); snapping to closest canonical label anyway "
|
||||||
|
"(check meta/canonical_vocabulary.json for missing phases)",
|
||||||
|
record.episode_index,
|
||||||
|
len(raw),
|
||||||
|
)
|
||||||
|
for span in raw:
|
||||||
|
mapped = self._canonicalize_subtask(span["text"], force=True)
|
||||||
|
if mapped:
|
||||||
|
cleaned.append({**span, "text": mapped})
|
||||||
|
elif len(cleaned) < len(raw):
|
||||||
|
logger.info(
|
||||||
|
"episode %d: %d/%d subtasks survived canonicalisation; "
|
||||||
|
"the rest were off-vocabulary",
|
||||||
|
record.episode_index,
|
||||||
|
len(cleaned),
|
||||||
|
len(raw),
|
||||||
|
)
|
||||||
cleaned.sort(key=lambda s: s["start"])
|
cleaned.sort(key=lambda s: s["start"])
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
@@ -387,15 +420,28 @@ class PlanSubtasksMemoryModule:
|
|||||||
f"{bullets}\n\n"
|
f"{bullets}\n\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _canonicalize_subtask(self, text: str) -> str:
|
_CANONICALIZE_JACCARD_FLOOR: float = 0.25
|
||||||
"""Snap ``text`` to the closest canonical subtask string, or drop it.
|
_CANONICALIZE_IGNORE_TOKENS: frozenset[str] = frozenset(
|
||||||
|
{"the", "a", "an", "to", "into", "from", "of", "on", "over", "at"}
|
||||||
|
)
|
||||||
|
|
||||||
|
def _canonicalize_subtask(self, text: str, *, force: bool = False) -> str:
|
||||||
|
"""Snap ``text`` to the closest canonical subtask string.
|
||||||
|
|
||||||
Without a vocabulary, the original text passes through. With a
|
Without a vocabulary, the original text passes through. With a
|
||||||
vocabulary, an exact case-insensitive match wins; failing that,
|
vocabulary, an exact case-insensitive match wins; failing that,
|
||||||
the best Jaccard overlap on the word set is used as a tolerant
|
the best Jaccard overlap on the word set is used as a tolerant
|
||||||
fuzzy match (handles articles / minor reorderings). If nothing
|
fuzzy match (handles articles / minor reorderings).
|
||||||
clears the floor, the subtask is dropped — better to skip a
|
|
||||||
phase than to feed the action expert an off-distribution string.
|
Behaviour at the Jaccard floor depends on ``force``:
|
||||||
|
- ``force=False`` (default): below ``_CANONICALIZE_JACCARD_FLOOR``
|
||||||
|
the subtask is dropped. ``_generate_subtasks`` runs this first
|
||||||
|
to filter genuine off-topic hallucinations.
|
||||||
|
- ``force=True``: always snap, no floor. ``_generate_subtasks``
|
||||||
|
uses this in a second pass when the first pass would otherwise
|
||||||
|
empty the episode — a slightly-wrong canonical label is still
|
||||||
|
closer to the right phase than no subtask at all, which makes
|
||||||
|
the whole episode invisible to the downstream policy.
|
||||||
"""
|
"""
|
||||||
if self.vocabulary is None or not self.vocabulary.subtasks:
|
if self.vocabulary is None or not self.vocabulary.subtasks:
|
||||||
return text.strip()
|
return text.strip()
|
||||||
@@ -406,14 +452,17 @@ class PlanSubtasksMemoryModule:
|
|||||||
if candidate.lower() == lowered:
|
if candidate.lower() == lowered:
|
||||||
return candidate
|
return candidate
|
||||||
# Jaccard fallback: token-set overlap, drop articles + adverbs.
|
# Jaccard fallback: token-set overlap, drop articles + adverbs.
|
||||||
ignore = {"the", "a", "an", "to", "into", "from", "of", "on", "over", "at"}
|
words = {
|
||||||
words = {w for w in lowered.replace(",", " ").split() if w and w not in ignore}
|
w for w in lowered.replace(",", " ").split()
|
||||||
|
if w and w not in self._CANONICALIZE_IGNORE_TOKENS
|
||||||
|
}
|
||||||
if not words:
|
if not words:
|
||||||
return ""
|
return ""
|
||||||
best: tuple[float, str] | None = None
|
best: tuple[float, str] | None = None
|
||||||
for candidate in candidates:
|
for candidate in candidates:
|
||||||
cand_words = {
|
cand_words = {
|
||||||
w for w in candidate.lower().replace(",", " ").split() if w and w not in ignore
|
w for w in candidate.lower().replace(",", " ").split()
|
||||||
|
if w and w not in self._CANONICALIZE_IGNORE_TOKENS
|
||||||
}
|
}
|
||||||
if not cand_words:
|
if not cand_words:
|
||||||
continue
|
continue
|
||||||
@@ -422,14 +471,16 @@ class PlanSubtasksMemoryModule:
|
|||||||
score = inter / union if union else 0.0
|
score = inter / union if union else 0.0
|
||||||
if best is None or score > best[0]:
|
if best is None or score > best[0]:
|
||||||
best = (score, candidate)
|
best = (score, candidate)
|
||||||
# Floor: require at least ~half the tokens to overlap. Below that
|
if best is None:
|
||||||
# the VLM is hallucinating a novel phrase; drop rather than warp
|
return ""
|
||||||
# it into something semantically wrong.
|
if not force and best[0] < self._CANONICALIZE_JACCARD_FLOOR:
|
||||||
if best is None or best[0] < 0.5:
|
logger.info(
|
||||||
logger.warning(
|
"subtask %r dropped — best canonical match %r scored %.2f "
|
||||||
"subtask %r did not match any canonical label (best=%s) — dropping",
|
"(< %.2f Jaccard floor)",
|
||||||
cleaned,
|
cleaned,
|
||||||
best,
|
best[1],
|
||||||
|
best[0],
|
||||||
|
self._CANONICALIZE_JACCARD_FLOOR,
|
||||||
)
|
)
|
||||||
return ""
|
return ""
|
||||||
return best[1]
|
return best[1]
|
||||||
|
|||||||
@@ -217,13 +217,20 @@ def test_plan_module_canonicalizes_paraphrased_subtask(
|
|||||||
def test_plan_module_drops_off_vocab_subtask(
|
def test_plan_module_drops_off_vocab_subtask(
|
||||||
fixture_dataset_root: Path, tmp_path: Path
|
fixture_dataset_root: Path, tmp_path: Path
|
||||||
) -> None:
|
) -> None:
|
||||||
"""A subtask with low overlap to every canonical label is dropped."""
|
"""A subtask with low overlap to every canonical label is dropped.
|
||||||
|
|
||||||
|
Drop only kicks in when *at least one* other subtask survives — if
|
||||||
|
every span would be dropped the episode would come out empty, so
|
||||||
|
``_generate_subtasks`` falls back to snap-without-floor; that path
|
||||||
|
is exercised by ``test_plan_module_snaps_when_all_off_vocab``.
|
||||||
|
"""
|
||||||
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
|
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
|
||||||
|
|
||||||
def responder(_messages):
|
def responder(_messages):
|
||||||
return {
|
return {
|
||||||
"subtasks": [
|
"subtasks": [
|
||||||
# in-vocab
|
# in-vocab — keeps the episode non-empty so the floor
|
||||||
|
# is allowed to drop the next span.
|
||||||
{"text": "grasp blue cube", "start": 0.0, "end": 0.4},
|
{"text": "grasp blue cube", "start": 0.0, "end": 0.4},
|
||||||
# off-vocab hallucination — no token overlap above the
|
# off-vocab hallucination — no token overlap above the
|
||||||
# Jaccard floor; should be dropped.
|
# Jaccard floor; should be dropped.
|
||||||
@@ -246,6 +253,43 @@ def test_plan_module_drops_off_vocab_subtask(
|
|||||||
assert subtask_texts == ["grasp blue cube"]
|
assert subtask_texts == ["grasp blue cube"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_plan_module_snaps_when_all_off_vocab(
|
||||||
|
fixture_dataset_root: Path, tmp_path: Path
|
||||||
|
) -> None:
|
||||||
|
"""All-off-vocab spans snap to nearest canonical instead of emptying the episode."""
|
||||||
|
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
|
||||||
|
|
||||||
|
def responder(_messages):
|
||||||
|
return {
|
||||||
|
"subtasks": [
|
||||||
|
# Both off-vocab — would normally be dropped. The
|
||||||
|
# fallback should snap each to its best canonical match
|
||||||
|
# rather than leave the episode with no subtasks at all.
|
||||||
|
{"text": "make a smoothie", "start": 0.0, "end": 0.4},
|
||||||
|
{"text": "consult the wizard", "start": 0.4, "end": 0.9},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
vlm = StubVlmClient(responder=responder)
|
||||||
|
vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
|
||||||
|
module = PlanSubtasksMemoryModule(
|
||||||
|
vlm=vlm,
|
||||||
|
config=PlanConfig(n_task_rephrasings=0),
|
||||||
|
vocabulary=vocab,
|
||||||
|
)
|
||||||
|
record = next(iter_episodes(fixture_dataset_root))
|
||||||
|
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||||
|
module.run_episode(record, staging)
|
||||||
|
rows = staging.read("plan")
|
||||||
|
subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
|
||||||
|
# Two off-vocab spans → two canonical subtasks (snapped to nearest
|
||||||
|
# by Jaccard with no floor). The exact canonical choice doesn't
|
||||||
|
# matter — only that the episode came out with subtasks rather
|
||||||
|
# than empty.
|
||||||
|
assert len(subtask_texts) == 2
|
||||||
|
assert all(s in _CANONICAL_SUBTASKS for s in subtask_texts)
|
||||||
|
|
||||||
|
|
||||||
def test_plan_module_without_vocab_passes_through(
|
def test_plan_module_without_vocab_passes_through(
|
||||||
fixture_dataset_root: Path, tmp_path: Path
|
fixture_dataset_root: Path, tmp_path: Path
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user