From 1fe1463ae0610b490b0cea9c28f777dcb90ceaab Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:13:50 +0200
Subject: [PATCH] annotate: enable subtask describe->segment->verify chain by
 default

Flip PlanConfig.subtask_describe_first and subtask_verify defaults
False -> True. Every subtask annotation now runs the 3-call grounding
+ pruning chain by default, since the single-call path reliably
hallucinates steps from the task text. Costs 2 extra VLM calls/episode;
disable with --plan.subtask_describe_first=false /
--plan.subtask_verify=false on easy datasets where fewer calls matter
more than label fidelity.

run_hf_job.py: drop the now-redundant explicit flags, leave a note that
the chain is default-on and how to opt out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py                 | 11 ++++-------
 .../annotations/steerable_pipeline/config.py       | 14 ++++++++++----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 421b263da..ade582861 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -82,13 +82,10 @@ CMD = (
     # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
     "--plan.plan_max_steps=6 "
-    # Multi-call quality chain (3 VLM calls/episode for subtasks):
-    #   1. describe-first: narrate ONLY what is visible before segmenting
-    #      — the strongest fix for subtasks invented from the task text.
-    #   2. (segment)
-    #   3. verify: re-watch and prune any subtask not actually seen.
-    "--plan.subtask_describe_first=true "
-    "--plan.subtask_verify=true "
+    # NOTE: the multi-call subtask quality chain (describe -> segment ->
+    # verify, 3 VLM calls/episode) is ON BY DEFAULT now. Pass
+    # --plan.subtask_describe_first=false / --plan.subtask_verify=false to
+    # disable it on datasets you've verified are easy, to save VLM calls.
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 9a0dd4232..18867f701 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -51,21 +51,27 @@ class PlanConfig:
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # Multi-call subtask quality chain (opt-in, more VLM calls, higher
-    # quality). Both off by default → single-call behaviour unchanged.
+    # Multi-call subtask quality chain. ON by default — the single-call
+    # 'watch video -> emit subtask JSON' pattern makes the VLM commit to
+    # structured output before reasoning about the video, so it
+    # pattern-matches the task text and hallucinates steps. The chain
+    # costs 2 extra VLM calls/episode (3 total for subtasks) but is the
+    # difference between trustworthy and fabricated labels. Set either to
+    # False to trade quality for fewer calls on datasets you've verified
+    # are easy.
     #
     # ``subtask_describe_first``: run a grounding pass that narrates ONLY
     # what is visible in the video (no subtask JSON yet), then inject that
     # description into the segmentation prompt. Forces the model to
     # observe before committing to structured output — the strongest
     # lever against subtasks invented from the task text. +1 VLM call/ep.
-    subtask_describe_first: bool = False
+    subtask_describe_first: bool = True
     # ``subtask_verify``: after segmentation, re-watch the video and drop
     # any proposed subtask that can't be verified as visible. Prunes
     # hallucinations; can only remove subtasks, never add/rewrite them.
     # Fail-open (keeps un-verified spans if the verify call returns
     # nothing). +1 VLM call/ep.
-    subtask_verify: bool = False
+    subtask_verify: bool = True
 
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4