mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 05:59:52 +00:00
fix(annotate): never leave an episode with zero canonical subtasks
When the canonical vocabulary is enabled and the VLM produces spans
that don't overlap any canonical label, the previous Jaccard-floor
(0.5) dropped them and the episode came out with no subtasks at all
— invisible to the downstream policy. Observed on
``pepijn223/super_poulain_vocab``: some episodes had empty subtask
columns because every VLM-emitted phrase scored below 0.5 against
the discovered vocabulary.
Two-pass canonicalisation:
- First pass keeps the Jaccard floor (lowered from 0.5 → 0.25 to
let mild paraphrases through) and drops every span scoring below it.
- If that first pass leaves the episode with **zero** subtasks,
fall back to a second pass that always snaps each VLM span to
its nearest canonical label by Jaccard (no floor). The episode
ends up with subtasks even when the vocabulary missed a phase
— a slightly-wrong canonical label is still closer to the right
motion than nothing at all.
- Log loudly when the fallback fires so the operator can spot
coverage gaps in ``meta/canonical_vocabulary.json``.
- Log a per-episode count at INFO when some (but not all) spans
were dropped so it's visible without spamming the run output.
Promote the Jaccard floor + ignore-tokens to class constants so
they're a single edit point. Add a ``force=True`` parameter to
``_canonicalize_subtask`` for the no-floor fallback path.
New test ``test_plan_module_snaps_when_all_off_vocab`` covers the
fallback; existing ``test_plan_module_drops_off_vocab_subtask`` is
adjusted to keep at least one in-vocab span so the floor path can
still fire and is exercised. All 12 vocabulary tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -217,13 +217,20 @@ def test_plan_module_canonicalizes_paraphrased_subtask(
|
||||
def test_plan_module_drops_off_vocab_subtask(
|
||||
fixture_dataset_root: Path, tmp_path: Path
|
||||
) -> None:
|
||||
"""A subtask with low overlap to every canonical label is dropped."""
|
||||
"""A subtask with low overlap to every canonical label is dropped.
|
||||
|
||||
Drop only kicks in when *at least one* other subtask survives — if
|
||||
every span would be dropped the episode would come out empty, so
|
||||
``_generate_subtasks`` falls back to snap-without-floor; that path
|
||||
is exercised by ``test_plan_module_snaps_when_all_off_vocab``.
|
||||
"""
|
||||
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
|
||||
|
||||
def responder(_messages):
|
||||
return {
|
||||
"subtasks": [
|
||||
# in-vocab
|
||||
# in-vocab — keeps the episode non-empty so the floor
|
||||
# is allowed to drop the next span.
|
||||
{"text": "grasp blue cube", "start": 0.0, "end": 0.4},
|
||||
# off-vocab hallucination — no token overlap above the
|
||||
# Jaccard floor; should be dropped.
|
||||
@@ -246,6 +253,43 @@ def test_plan_module_drops_off_vocab_subtask(
|
||||
assert subtask_texts == ["grasp blue cube"]
|
||||
|
||||
|
||||
def test_plan_module_snaps_when_all_off_vocab(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """An episode whose spans are all off-vocab must still end up with subtasks.

    The stub VLM answers with two phrases that are not canonical labels.
    Rather than producing an episode with no subtask rows, the plan module
    is expected to map each span onto some canonical label.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def responder(_messages):
        # Neither phrase is a canonical label, so both would ordinarily be
        # dropped. The fallback is expected to snap each one to its best
        # canonical match instead of leaving the episode without subtasks.
        off_vocab_spans = [
            {"text": "make a smoothie", "start": 0.0, "end": 0.4},
            {"text": "consult the wizard", "start": 0.4, "end": 0.9},
        ]
        return {"subtasks": off_vocab_spans}

    stub_vlm = StubVlmClient(responder=responder)
    canonical_vocab = Vocabulary(
        subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
    )
    plan_module = PlanSubtasksMemoryModule(
        vlm=stub_vlm,
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=canonical_vocab,
    )

    # Run the module on the first episode of the fixture dataset only.
    first_episode = next(iter_episodes(fixture_dataset_root))
    stage = EpisodeStaging(tmp_path / "stage", first_episode.episode_index)
    plan_module.run_episode(first_episode, stage)

    # Collect the text of every staged row tagged as a subtask.
    subtask_texts = []
    for row in stage.read("plan"):
        if row["style"] == "subtask":
            subtask_texts.append(row["content"])

    # Two off-vocab spans in, two canonical subtasks out. Which canonical
    # label each span lands on is deliberately not pinned: the point is only
    # that the episode is non-empty and every emitted label is canonical.
    assert len(subtask_texts) == 2
    for text in subtask_texts:
        assert text in _CANONICAL_SUBTASKS
|
||||
|
||||
|
||||
def test_plan_module_without_vocab_passes_through(
|
||||
fixture_dataset_root: Path, tmp_path: Path
|
||||
) -> None:
|
||||
|
||||
Reference in New Issue
Block a user