From 54221ceea245dbc0769155d77d7142242afe842f Mon Sep 17 00:00:00 2001 From: pepijn Date: Fri, 22 May 2026 11:46:31 +0000 Subject: [PATCH] feat(annotate): let the VLM decide vocabulary size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hardcoding ``n_subtask_target=10`` and ``n_memory_target=6`` baked task complexity into the config — a simple pick-and-place needs ~6, a multi-step recipe needs ~20. The VLM already sees the clips, so let it pick the count itself from what's recurring across episodes. Drop both knobs from ``VocabularyConfig`` and the ``module_0_vocabulary`` prompt template. The prompt now says "decide the count yourself based on what you see — the smallest set that still covers every recurring phase" and adds an "each label must recur across the demos" rule so the VLM filters out one-off motions. Update the launcher script + docs to remove the old knobs. Co-Authored-By: Claude Opus 4.7 (1M context) Co-authored-by: Cursor --- docs/source/annotation_pipeline.mdx | 21 ++++++++++--------- examples/annotations/run_hf_job.py | 9 ++++---- .../annotations/steerable_pipeline/config.py | 15 +++++++------ .../prompts/module_0_vocabulary.txt | 11 ++++++++-- .../steerable_pipeline/vocabulary.py | 2 -- 5 files changed, 31 insertions(+), 27 deletions(-) diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx index 6e4cd9563..9d6e66231 100644 --- a/docs/source/annotation_pipeline.mdx +++ b/docs/source/annotation_pipeline.mdx @@ -24,16 +24,17 @@ rewrites the data shards in place: The `plan` module is constrained to a **canonical vocabulary** discovered once per dataset by the `vocabulary` module (phase 0). It watches a few sample episode videos (`--vocabulary.sample_episodes`, default `3`) and -asks the VLM to derive a small set of imperative subtask labels -(~`--vocabulary.n_subtask_target`, default `10`) and first-person memory -milestones (~`--vocabulary.n_memory_target`, default `6`) that recur -across the demos. The result lands at -`meta/canonical_vocabulary.json` (human-readable / hand-editable) and is -reused on every subsequent run. The `plan` module then constrains both -subtask + memory generation to those exact strings — the downstream -low-level policy sees a small, repeatable target distribution instead of -thousands of LLM paraphrases. Disable with `--vocabulary.enabled=False` -to fall back to free-form generation. +asks the VLM to derive a small set of imperative subtask labels and +first-person memory milestones that recur across the demos. The VLM +picks the right number of entries itself based on what it sees in the +clips — short pick-and-place demos get ~6 subtask labels, longer +multi-step recipes get more. The result lands at +`meta/canonical_vocabulary.json` (human-readable / hand-editable) and +is reused on every subsequent run. The `plan` module then constrains +both subtask + memory generation to those exact strings — the +downstream low-level policy sees a small, repeatable target +distribution instead of thousands of LLM paraphrases. Disable with +`--vocabulary.enabled=False` to fall back to free-form generation. The writer does **not** add a `tools` column to the parquet — the tool catalog lives at `meta/info.json["tools"]` instead (see diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index 905811d14..f3e497039 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -55,12 +55,11 @@ CMD = ( "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " "--vlm.camera_key=observation.images.wrist " # Phase 0 — canonical vocabulary discovery from the first N sample - # episodes. The resulting meta/canonical_vocabulary.json constrains - # every subtask + memory string to a small repeatable target - # distribution; tune the counts for your task complexity. + # episodes. The VLM picks the right number of subtask + memory + # entries itself from what it sees; the resulting + # meta/canonical_vocabulary.json constrains every subtask + memory + # string to a small repeatable target distribution. "--vocabulary.sample_episodes=3 " - "--vocabulary.n_subtask_target=10 " - "--vocabulary.n_memory_target=6 " # Phase 1 — plan module (subtasks + plan + memory + task_aug). "--plan.frames_per_second=1.0 " "--plan.use_video_url=true " diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index d80e4149a..da07d7998 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -26,12 +26,13 @@ class VocabularyConfig: """Phase 0 — dataset-level canonical vocabulary discovery. Watches the first ``sample_episodes`` episode videos and asks the VLM - to derive a small canonical vocabulary (~``n_subtask_target`` subtask - labels + ~``n_memory_target`` memory milestones) that every episode - in the dataset will reuse. The output lands at - ``meta/canonical_vocabulary.json`` and feeds phase 1's subtask + - memory generation as both a prompt-side constraint and a post-VLM - validation gate. + to derive a small canonical vocabulary (subtask labels + memory + milestones) that every episode in the dataset will reuse. The VLM + decides the count itself from what it sees in the clips — short + pick-and-place demos get ~6 labels, longer multi-step recipes more. + The output lands at ``meta/canonical_vocabulary.json`` and feeds + phase 1's subtask + memory generation as both a prompt-side + constraint and a post-VLM validation gate. Why this exists: free-form LLM rephrasing per episode produces near- unique subtask strings, which makes the downstream low-level policy's @@ -48,8 +49,6 @@ class VocabularyConfig: enabled: bool = True sample_episodes: int = 3 - n_subtask_target: int = 10 - n_memory_target: int = 6 max_video_frames_per_episode: int = 32 # When True (default), an existing meta/canonical_vocabulary.json is # loaded as-is and no VLM call is made — lets operators hand-edit the diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt index f867c34b5..00c29be4e 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt @@ -8,6 +8,13 @@ conditioned on these strings — duplicate phrasings (e.g. "grasp blue cube" vs "pick up the blue cube") would destroy the conditioning, so pick one wording per concept and reuse it everywhere. +Decide how many entries each list needs YOURSELF based on what you see — +the smallest set that still covers every recurring phase in the demos. +A simple two-object pick-and-place might need ~6 subtask labels and 2 +memory milestones; a long multi-step recipe needs more. Err on the side +of FEWER — extra entries that don't recur across episodes weaken the +conditioning. + You output two lists: 1. `subtasks`: imperative, telegraphic commands the robot can execute. @@ -16,7 +23,8 @@ You output two lists: "cube" — never "block" / "object"). - Atomic — one skill per subtask (gripper-open events, contact, regrasps, transitions all become cut points). - - Aim for ~{n_subtask_target} labels. Fewer is better than more. + - Each label must recur across the demos. If you see a motion only + once across all sample clips, it probably isn't a canonical phase. - Good: "move to blue cube", "grasp blue cube", "lift blue cube", "place blue cube in box", "release blue cube", "retract arm". - Bad: "the robot arm moves towards the blue cube" (third person, @@ -30,7 +38,6 @@ You output two lists: should NOT. - First person, past tense. Start with "I". - One sentence. Functional outcome only — no grasp / motion detail. - - Aim for ~{n_memory_target} milestones. - Good: "I picked up the blue cube.", "I placed the blue cube in the green box.", "I wiped the counter." - Bad: "The robot arm grasped the blue cube." (third person), diff --git a/src/lerobot/annotations/steerable_pipeline/vocabulary.py b/src/lerobot/annotations/steerable_pipeline/vocabulary.py index 8787ec372..121cef849 100644 --- a/src/lerobot/annotations/steerable_pipeline/vocabulary.py +++ b/src/lerobot/annotations/steerable_pipeline/vocabulary.py @@ -190,8 +190,6 @@ class VocabularyDiscoveryModule: prompt = load_prompt("module_0_vocabulary").format( episode_task=task_hint or "(unspecified)", n_episodes=len(sample), - n_subtask_target=int(self.config.n_subtask_target), - n_memory_target=int(self.config.n_memory_target), ) # Pack one video block per sample episode so the VLM sees the # variation across episodes (different starting poses, different