From 1fe1463ae0610b490b0cea9c28f777dcb90ceaab Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 2 Jun 2026 15:13:50 +0200 Subject: [PATCH] annotate: enable subtask describe->segment->verify chain by default Flip PlanConfig.subtask_describe_first and subtask_verify defaults False -> True. Every subtask annotation now runs the 3-call grounding + pruning chain by default, since the single-call path reliably hallucinates steps from the task text. Costs 2 extra VLM calls/episode; disable with --plan.subtask_describe_first=false / --plan.subtask_verify=false on easy datasets where fewer calls matter more than label fidelity. run_hf_job.py: drop the now-redundant explicit flags, leave a note that the chain is default-on and how to opt out. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/annotations/run_hf_job.py | 11 ++++------- .../annotations/steerable_pipeline/config.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index 421b263da..ade582861 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -82,13 +82,10 @@ CMD = ( # tasks. Leave off for RoboCasa atomic / navigation. # Keep subtask decomposition tight for atomic tasks: "--plan.plan_max_steps=6 " - # Multi-call quality chain (3 VLM calls/episode for subtasks): - # 1. describe-first: narrate ONLY what is visible before segmenting - # — the strongest fix for subtasks invented from the task text. - # 2. (segment) - # 3. verify: re-watch and prune any subtask not actually seen. - "--plan.subtask_describe_first=true " - "--plan.subtask_verify=true " + # NOTE: the multi-call subtask quality chain (describe -> segment -> + # verify, 3 VLM calls/episode) is ON BY DEFAULT now. Pass + # --plan.subtask_describe_first=false / --plan.subtask_verify=false to + # disable on datasets you've verified are easy and want fewer calls. # Phase 2 — interjections + speech. 
"--interjections.max_interjections_per_episode=6 " # Phase 4 — general VQA. diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 9a0dd4232..18867f701 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -51,21 +51,27 @@ class PlanConfig: min_subtask_seconds: float = 1.5 plan_max_steps: int = 8 - # Multi-call subtask quality chain (opt-in, more VLM calls, higher - # quality). Both off by default → single-call behaviour unchanged. + # Multi-call subtask quality chain. ON by default — the single-call + # 'watch video -> emit subtask JSON' pattern makes the VLM commit to + # structured output before reasoning about the video, so it + # pattern-matches the task text and hallucinates steps. The chain + # costs 2 extra VLM calls/episode (3 total for subtasks) but is the + # difference between trustworthy and fabricated labels. Set either to + # False to trade quality for fewer calls on datasets you've verified + # are easy. # # ``subtask_describe_first``: run a grounding pass that narrates ONLY # what is visible in the video (no subtask JSON yet), then inject that # description into the segmentation prompt. Forces the model to # observe before committing to structured output — the strongest # lever against subtasks invented from the task text. +1 VLM call/ep. - subtask_describe_first: bool = False + subtask_describe_first: bool = True # ``subtask_verify``: after segmentation, re-watch the video and drop # any proposed subtask that can't be verified as visible. Prunes # hallucinations; can only remove subtasks, never add/rewrite them. # Fail-open (keeps un-verified spans if the verify call returns # nothing). +1 VLM call/ep. - subtask_verify: bool = False + subtask_verify: bool = True # When True (and backend supports it, e.g. ``openai``), the ``plan`` # module sends a ``video_url`` block pointing at a per-episode mp4