diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index 421b263da..ade582861 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -82,13 +82,10 @@ CMD = ( # tasks. Leave off for RoboCasa atomic / navigation. # Keep subtask decomposition tight for atomic tasks: "--plan.plan_max_steps=6 " - # Multi-call quality chain (3 VLM calls/episode for subtasks): - # 1. describe-first: narrate ONLY what is visible before segmenting - # — the strongest fix for subtasks invented from the task text. - # 2. (segment) - # 3. verify: re-watch and prune any subtask not actually seen. - "--plan.subtask_describe_first=true " - "--plan.subtask_verify=true " + # NOTE: the multi-call subtask quality chain (describe -> segment -> + # verify, 3 VLM calls/episode) is ON BY DEFAULT now. Pass + # --plan.subtask_describe_first=false / --plan.subtask_verify=false to + # disable either stage on datasets you've verified are easy, to save calls. # Phase 2 — interjections + speech. "--interjections.max_interjections_per_episode=6 " # Phase 4 — general VQA. diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 9a0dd4232..18867f701 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -51,21 +51,27 @@ class PlanConfig: min_subtask_seconds: float = 1.5 plan_max_steps: int = 8 - # Multi-call subtask quality chain (opt-in, more VLM calls, higher - # quality). Both off by default → single-call behaviour unchanged. + # Multi-call subtask quality chain. ON by default — the single-call + # 'watch video -> emit subtask JSON' pattern makes the VLM commit to + # structured output before reasoning about the video, so it + # pattern-matches the task text and hallucinates steps. 
The chain + # costs 2 extra VLM calls/episode (3 total for subtasks) but is the + # difference between trustworthy and fabricated labels. Set either to + # False to trade quality for fewer calls on datasets you've verified + # are easy. # # ``subtask_describe_first``: run a grounding pass that narrates ONLY # what is visible in the video (no subtask JSON yet), then inject that # description into the segmentation prompt. Forces the model to # observe before committing to structured output — the strongest # lever against subtasks invented from the task text. +1 VLM call/ep. - subtask_describe_first: bool = False + subtask_describe_first: bool = True # ``subtask_verify``: after segmentation, re-watch the video and drop # any proposed subtask that can't be verified as visible. Prunes # hallucinations; can only remove subtasks, never add/rewrite them. # Fail-open (keeps un-verified spans if the verify call returns # nothing). +1 VLM call/ep. - subtask_verify: bool = False + subtask_verify: bool = True # When True (and backend supports it, e.g. ``openai``), the ``plan`` # module sends a ``video_url`` block pointing at a per-episode mp4