diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index 01ef58f4d..f669593e9 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -53,13 +53,30 @@ CMD = ( "--executor.episode_parallelism=16 " "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " "--vlm.camera_key=observation.images.robot0_agentview_right " - # Phase 1 — plan module (subtasks + plan + memory + task_aug). + # Phase 1 — plan module (subtasks + plan + memory). "--plan.frames_per_second=1.0 " "--plan.use_video_url=true " "--plan.use_video_url_fps=1.0 " - "--plan.derive_task_from_video=always " - "--plan.task_aug_axes.enabled=true " - "--plan.action_records.enabled=true " + # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the + # stove", "Pick the mug...") is authoritative and is what eval uses. + # ``derive_task_from_video=off`` keeps that canonical task driving + # subtask generation. Do NOT use ``always`` here — it throws the real + # task away, asks the VLM "what is this video about?" with no hint, + # and the hallucinated task then poisons every subtask + plan row. + "--plan.derive_task_from_video=off " + # NO task augmentation for RoboCasa: eval conditions on the exact task + # strings, so synthetic rephrasings are unused at best and (when they + # drift, e.g. "wander around the kitchen") harmful. 0 rephrasings + + # axes disabled = the policy only ever sees the canonical task. + "--plan.n_task_rephrasings=0 " + # action_records OFF: the structured {verb,object,arm,grasp,dest} + # schema is a manipulation schema; RoboCasa navigation / atomic tasks + # don't fit it and the VLM hallucinates (e.g. "move stove to stove"). + # Leave off unless annotating long composite manipulation tasks you've + # verified render cleanly (and even then replace_subtask_text stays + # off by default so records are additive, never overwriting subtasks). + # Keep subtask decomposition tight for atomic tasks: + "--plan.plan_max_steps=6 " # Phase 2 — interjections + speech. "--interjections.max_interjections_per_episode=6 " # Phase 4 — general VQA. diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index c60e58fee..a3c1306be 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -94,12 +94,18 @@ class ActionRecordsConfig: A deterministic Python template then renders the record back to canonical subtask text (e.g. ``pick blue cube with left arm using - pinch grip``). When ``replace_subtask_text=True`` (default), the - rendered text REPLACES the VLM's free-form subtask text — eliminating - cross-episode phrasing drift. When ``emit_record_row=True`` - (default), the structured record is also emitted as a row with - ``style="action_record"`` so downstream consumers can train on the - typed schema directly. + pinch grip``). When ``replace_subtask_text=True``, the rendered text + REPLACES the VLM's free-form subtask text. This is OFF by default: + the structured fields are easy for the VLM to hallucinate on tasks + that don't fit the manipulation schema (e.g. navigation tasks yield + nonsense like ``move stove to stove``), and silently overwriting the + subtask text with a reconstruction is high-risk. Leave it off to keep + the original VLM subtask text and treat the record as additive + metadata; only flip it on for datasets you've verified render + cleanly. When ``emit_record_row=True`` (default), the structured + record is also emitted as a row with ``style="action_record"`` so + downstream consumers can train on the typed schema directly — + without touching the subtask text. Cost: one extra VLM call per subtask. For an 8-subtask episode this means ~8x more VLM calls in the plan module — still cheap relative @@ -110,9 +116,11 @@ class ActionRecordsConfig: # When True, replace the VLM-generated subtask text with the # deterministic template's rendering of the structured record. - # Strongly recommended — it's the whole point of the structured - # intermediate. Set False to keep both representations side by side. - replace_subtask_text: bool = True + # OFF by default — see class docstring. Overwriting good subtask + # text with a reconstruction of hallucinated structured fields is + # high-risk (navigation / non-manipulation tasks render to + # nonsense). Keep records additive (``emit_record_row``) instead. + replace_subtask_text: bool = False # When True, emit a separate row with ``style="action_record"`` and # ``content=json.dumps(record)`` at the subtask's start timestamp. diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index f58ec2c91..6ef5352b0 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -424,6 +424,13 @@ class PlanSubtasksMemoryModule: if not verb: return "" + # Drop a degenerate destination that just echoes the object — the + # VLM sometimes fills both with the same noun (e.g. navigation: + # ``verb=move object=stove destination=stove`` → "move stove to + # stove"). Treat that as "no meaningful destination". + if dest and obj and dest.strip().lower() == obj.strip().lower(): + dest = "" + parts: list[str] = [verb] if obj: parts.append(obj) @@ -431,7 +438,7 @@ class PlanSubtasksMemoryModule: # Pick a sensible preposition per verb family. if verb in {"place", "put", "drop", "insert", "pour", "dump"}: parts.append(f"in {dest}") - elif verb in {"move", "transport", "reach"}: + elif verb in {"move", "transport", "reach", "navigate"}: parts.append(f"to {dest}") else: parts.append(f"at {dest}") diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt index d8cd13104..8b19a0a8e 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt @@ -37,9 +37,16 @@ Axes and target counts: orientation, grasp_method) appear in the original task. Hard rules: -- Each variant MUST preserve the core action and the target object. - Do not change which object is involved, the destination, or the - high-level action. +- Each variant MUST preserve the core action, the target object, AND + the goal / destination. Do not change which object is involved, where + it goes, or the high-level action. "Navigate to the stove" may become + "go to the stove" or "head over to the stove" — it must NEVER become + "wander around the kitchen", "explore the room", or anything that + drops or generalises the stove destination. If you cannot vary the + wording without changing the goal, emit fewer variants. +- Only the FIVE listed elements (wording, arm, orientation, grasp + method, or a combination) may be varied or omitted. The verb's + meaning, the object, and the destination are fixed. - Each variant is plain prose, no markdown, no quotes, no list numbers. - Each variant must be DISTINCT from every other variant in the entire output, both within and across axes. Near-duplicates are not allowed.