diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index f669593e9..8ce22c28f 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -71,10 +71,10 @@ CMD = ( "--plan.n_task_rephrasings=0 " # action_records OFF: the structured {verb,object,arm,grasp,dest} # schema is a manipulation schema; RoboCasa navigation / atomic tasks - # don't fit it and the VLM hallucinates (e.g. "move stove to stove"). - # Leave off unless annotating long composite manipulation tasks you've - # verified render cleanly (and even then replace_subtask_text stays - # off by default so records are additive, never overwriting subtasks). + # don't fit it and the VLM hallucinates. When on, records are purely + # additive (emitted as style="action_record" rows) and never touch + # the subtask text — useful only for long composite manipulation + # tasks. Leave off for RoboCasa atomic / navigation. # Keep subtask decomposition tight for atomic tasks: "--plan.plan_max_steps=6 " # Phase 2 — interjections + speech. diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index a3c1306be..f84fdaa08 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -92,20 +92,16 @@ class ActionRecordsConfig: "mistake": "" | null, } - A deterministic Python template then renders the record back to - canonical subtask text (e.g. ``pick blue cube with left arm using - pinch grip``). When ``replace_subtask_text=True``, the rendered text - REPLACES the VLM's free-form subtask text. This is OFF by default: - the structured fields are easy for the VLM to hallucinate on tasks - that don't fit the manipulation schema (e.g. navigation tasks yield - nonsense like ``move stove to stove``), and silently overwriting the - subtask text with a reconstruction is high-risk. 
Leave it off to keep - the original VLM subtask text and treat the record as additive - metadata; only flip it on for datasets you've verified render - cleanly. When ``emit_record_row=True`` (default), the structured - record is also emitted as a row with ``style="action_record"`` so - downstream consumers can train on the typed schema directly — - without touching the subtask text. + The record is emitted as a separate row with ``style="action_record"`` + (``content=json.dumps(record)``) at the subtask's start timestamp. + It is PURELY ADDITIVE — it never touches the VLM's subtask text. + Downstream training can consume the typed schema directly (e.g. + auxiliary supervision on verb / arm / grasp classification heads) + while the subtask string the policy conditions on stays exactly what + the subtask module produced. (Reconstructing subtask text from these + fields was too easy for the VLM to hallucinate on tasks that don't + fit the manipulation schema — navigation tasks yielded nonsense like + ``move stove to stove`` — so that path was removed.) Cost: one extra VLM call per subtask. For an 8-subtask episode this means ~8x more VLM calls in the plan module — still cheap relative @@ -114,18 +110,10 @@ class ActionRecordsConfig: enabled: bool = False - # When True, replace the VLM-generated subtask text with the - # deterministic template's rendering of the structured record. - # OFF by default — see class docstring. Overwriting good subtask - # text with a reconstruction of hallucinated structured fields is - # high-risk (navigation / non-manipulation tasks render to - # nonsense). Keep records additive (``emit_record_row``) instead. - replace_subtask_text: bool = False - - # When True, emit a separate row with ``style="action_record"`` and - # ``content=json.dumps(record)`` at the subtask's start timestamp. - # Lets downstream training consume the typed schema directly (e.g. - # auxiliary supervision on verb/arm/grasp classification heads). 
+ # When True (default), emit a separate row with ``style="action_record"`` + # and ``content=json.dumps(record)`` at the subtask's start timestamp. + # This row is the feature's only output, so to skip the extra VLM + # calls set ``enabled=False`` rather than turning this flag off. emit_record_row: bool = True # Frame sampling for the per-subtask VLM call (similar to the diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index 6ef5352b0..5e66f67be 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -124,28 +124,24 @@ class PlanSubtasksMemoryModule: subtask_spans = self._generate_subtasks(record, task=effective_task) # ---------------------------------------------------------------- - # Phase 1a + 1b: structured per-subtask action records + # Phase 1a: structured per-subtask action records (additive) # ---------------------------------------------------------------- # When enabled, for every subtask span we ask the VLM for a typed # ActionRecord (verb / object / arm / grasp_type / destination / - # mistake). A deterministic Python template renders the record - # back to canonical subtask text. The render replaces the - # free-form subtask text (cleaner conditioning) and the typed - # record is emitted as a separate row for downstream use. + # mistake) and emit it as a separate ``style="action_record"`` + # row for downstream use. This is purely additive — it never + # touches the VLM's subtask text (the VLM hallucinates these + # fields too easily on tasks that don't fit the manipulation + # schema, so text is no longer reconstructed from them). 
records_cfg = self.config.action_records action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans) if records_cfg.enabled and subtask_spans: for i, span in enumerate(subtask_spans): rec = self._extract_action_record(record, span, effective_task) - if rec is None: - continue - action_records[i] = rec - if records_cfg.replace_subtask_text: - canonical_text = self._render_action_record_to_subtask_text(rec) - if canonical_text: - span["text"] = canonical_text + if rec is not None: + action_records[i] = rec - # subtask rows (may now reflect canonical-rendered text) + # subtask rows for i, span in enumerate(subtask_spans): rows.append( { @@ -396,60 +392,6 @@ class PlanSubtasksMemoryModule: "mistake": mistake, } - @staticmethod - def _render_action_record_to_subtask_text(record: dict[str, Any]) -> str: - """Deterministic template: ``ActionRecord`` → canonical subtask text. - - Mirrors the authoring guidance in ``module_1_subtasks.txt``: - imperative, drop articles / adverbs, use canonical object nouns, - append arm / grasp clauses only when present. - - Examples (record → rendered text):: - - {verb=pick, object=blue cube} - → "pick blue cube" - {verb=pick, object=blue cube, arm=left, grasp_type=pinch} - → "pick blue cube with left arm using pinch grip" - {verb=place, object=blue cube, destination=green box} - → "place blue cube in green box" - {verb=move, object=mug, destination=stove} - → "move mug to stove" - """ - verb = (record.get("verb") or "").strip().lower() - obj = (record.get("object") or "").strip() - arm = (record.get("arm") or "").strip().lower() if record.get("arm") else "" - grasp = (record.get("grasp_type") or "").strip().lower() if record.get("grasp_type") else "" - dest = (record.get("destination") or "").strip() if record.get("destination") else "" - - if not verb: - return "" - - # Drop a degenerate destination that just echoes the object — the - # VLM sometimes fills both with the same noun (e.g. 
navigation: - # ``verb=move object=stove destination=stove`` → "move stove to - # stove"). Treat that as "no meaningful destination". - if dest and obj and dest.strip().lower() == obj.strip().lower(): - dest = "" - - parts: list[str] = [verb] - if obj: - parts.append(obj) - if dest: - # Pick a sensible preposition per verb family. - if verb in {"place", "put", "drop", "insert", "pour", "dump"}: - parts.append(f"in {dest}") - elif verb in {"move", "transport", "reach", "navigate"}: - parts.append(f"to {dest}") - else: - parts.append(f"at {dest}") - if arm == "both": - parts.append("with both arms") - elif arm in {"left", "right"}: - parts.append(f"with {arm} arm") - if grasp: - parts.append(f"using {grasp} grip") - return " ".join(parts) - # ------------------------------------------------------------------ # Structured 5-axis task augmentation (EgoMimic-style taxonomy) # ------------------------------------------------------------------