From 182f10184f78523cb07d6b4325428727726bd974 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 19 May 2026 14:07:23 +0200 Subject: [PATCH] revert(annotate): move pipeline changes to base PR (#3471) The deterministic-plan rewrite, single-frame VQA (K 3->1), dataset version tagging, telegraphic-subtask prompt and shorter interjection prompt belong in the annotation pipeline itself, not in the SmolVLA training PR. They have been applied to feat/language-annotation-pipeline (#3471). Reverting these six files here to the merge-base so they drop out of this PR's diff; #3491 will inherit the canonical versions when it next rebases on its base. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../annotations/steerable_pipeline/config.py | 9 +- .../modules/plan_subtasks_memory.py | 111 ++++++++---------- .../prompts/module_1_plan.txt | 18 +++ .../prompts/module_1_subtasks.txt | 27 ++--- .../prompts/module_2_interjection.txt | 18 +-- src/lerobot/scripts/lerobot_annotate.py | 37 ------ 6 files changed, 90 insertions(+), 130 deletions(-) create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 8c0d48f0a..678930784 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -114,14 +114,7 @@ class Module3Config: enabled: bool = True vqa_emission_hz: float = 1.0 - K: int = 1 - """How many *consecutive* frames each emission tick anchors a VQA pair - to. The VLM grounds its answer (bbox / keypoint coordinates, count, …) - against the *first* anchored frame's image, so anchoring K>1 frames - copies that same answer onto later frames where the scene has already - moved — stale labels. Default ``1``: a VQA pair lands on exactly its - emission frame, no temporal smear. 
Raise it only to trade label - precision for more (noisier) VQA frames.""" + K: int = 3 question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial") diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index 15ad5d287..c48d888fb 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -116,29 +116,18 @@ class PlanSubtasksMemoryModule: "tool_calls": None, } ) - # Plan rows at every subtask boundary — including t=0 (start of - # the first subtask). Because the plan is just a numbered list - # of *still-todo* subtasks, re-emitting at each boundary makes - # the active plan shrink as work progresses: at frame t the - # rendered ``${plan}`` is the most recent emission, which - # contains exactly the subtasks that started at or after the - # current span. Saves the runtime from having to derive - # "what's still left" at inference time. 
- for span in subtask_spans: - boundary_t = _snap_to_frame(span["start"], record.frame_timestamps) - plan_text = self._generate_plan( - record, subtask_spans, refresh_t=boundary_t, task=effective_task + # plan row at t=0 + plan_text = self._generate_plan(record, subtask_spans, task=effective_task) + if plan_text is not None: + rows.append( + { + "role": "assistant", + "content": plan_text, + "style": "plan", + "timestamp": float(t0), + "tool_calls": None, + } ) - if plan_text is not None: - rows.append( - { - "role": "assistant", - "content": plan_text, - "style": "plan", - "timestamp": float(boundary_t), - "tool_calls": None, - } - ) # memory rows at every subtask boundary except the very first start prior_memory = "" for i, span in enumerate(subtask_spans[1:], start=1): @@ -383,50 +372,54 @@ class PlanSubtasksMemoryModule: def _generate_plan( self, - record: EpisodeRecord, # noqa: ARG002 (kept for signature stability) + record: EpisodeRecord, subtask_spans: Sequence[dict[str, Any]], *, refresh_t: float | None = None, - interjection: str | None = None, # noqa: ARG002 - task: str | None = None, # noqa: ARG002 + interjection: str | None = None, + task: str | None = None, ) -> str | None: - """Deterministic plan = numbered list of *still-todo* subtasks. - - Previously this called the VLM with a prompt that asked it to - compress the subtasks into a "compact hierarchical plan". That - produced longer-than-necessary plans, cost an extra VLM round-trip - per episode (plus one per interjection on refresh), and could - diverge from the actual subtask sequence the model is going to - execute. Replacing it with a plain summarisation keeps the plan - tightly aligned with the upcoming subtasks and removes the VLM - call entirely. - - Layout (matches the v2 plan style — short imperative fragments - prefixed by "N. "): - - 1. - 2. - ... 
- - On a refresh at ``refresh_t`` (called from ``run_plan_updates`` - on interjection events), only subtasks whose start is at or - after ``refresh_t`` are included — the plan shrinks as work - progresses, so it always describes what's left. - """ if not subtask_spans: return None - remaining = [ - s for s in subtask_spans - if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t) - ] - if not remaining: - # Past the last subtask boundary on a late refresh — nothing - # left to plan; emit None so the caller skips the row. - return None - return "\n".join( - f"{i}. {span.get('text', '').strip()}" - for i, span in enumerate(remaining, start=1) + subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans) + prompt = load_prompt("module_1_plan").format( + episode_task=(task if task is not None else record.episode_task), + subtasks_text=subtasks_text, + plan_max_steps=self.config.plan_max_steps, ) + if refresh_t is not None: + # ``current_subtask`` is the span the refresh time falls into, + # so the model knows where in the demonstration the planner is + # standing when it re-emits. + current_subtask = "" + for span in subtask_spans: + if float(span["start"]) <= refresh_t and ( + "end" not in span or float(span["end"]) > refresh_t + ): + current_subtask = span.get("text", "") + break + if interjection: + prompt += ( + f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user " + f"interjection: {interjection!r}. Current subtask just " + f"before the interjection: {current_subtask!r}. Update " + f"the plan so it reflects the interjection — drop or " + f"reorder steps as needed; do not just restate.)\n" + ) + else: + # Refresh without an interjection text: still tell the model + # where in the episode the plan stands so the re-emission + # is grounded. Should be rare — plan refreshes are + # interjection-driven by design. + prompt += ( + f"\n\n(Plan refresh at t={refresh_t:.2f}s. 
Current " + f"subtask: {current_subtask!r}.)\n" + ) + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + result = self.vlm.generate_json([messages])[0] + if isinstance(result, dict) and isinstance(result.get("plan"), str): + return result["plan"].strip() + return None def _generate_memory( self, diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt new file mode 100644 index 000000000..b0121c977 --- /dev/null +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt @@ -0,0 +1,18 @@ +You are the high-level planner for a robot demonstrating: "{episode_task}". + +Given the subtask decomposition below, write a concise hierarchical PLAN +the robot should follow. Format the plan as a numbered list, one line per +high-level step. The plan describes the full task; subtasks are the atomic +skills used to execute it. + +Subtasks for context: +{subtasks_text} + +Authoring rules: +- 3 to {plan_max_steps} steps. +- Each step describes one logical chunk of the task, not one motion. +- Steps must be in execution order. +- Plain prose, no JSON, no markdown headers. + +Output strictly valid JSON: + {{ "plan": "1. ...\n2. ...\n3. ..." }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt index 56d14d42f..5d7c9cc8d 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt @@ -4,24 +4,17 @@ The user originally asked: "{episode_task}" You are shown the entire demonstration as a single video. Watch the whole clip, then segment it into a list of consecutive atomic subtasks -the robot performs. Write **telegraphic** action labels. +the robot performs. 
-Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts: +Authoring rules — based on Hi Robot (Shi 2025) atom granularity and +Pi0.7 (Physical Intelligence 2025) "how, not what" detail: -- Each subtask = one atomic skill the low-level policy can execute. -- **Hard length cap: ≤ 4 words.** Ideally 2-3. Form: VERB + (color) + - OBJECT. No articles ("the", "a"), no destinations, no adverbs, no - "robot"/"arm"/"gripper" — those are implied. -- **Use the exact object nouns from the task above.** If the task says - "cube", every subtask says "cube" — never switch to "block". If it - says "box", never switch to "bin"/"container". Consistent vocabulary - across the whole episode. -- Good: "move to blue cube", "grasp blue cube", "lift blue cube", - "place blue cube", "open drawer", "release yellow cube". -- Bad: "release the yellow block into the green bin" (articles, - destination, "block" instead of "cube"), "the robot arm moves - towards the blue cube" ("the robot arm", too long), "carefully - pick up the cube" (adverb, article). +- Each subtask is one atomic skill the low-level policy can execute, + e.g. "pick up one piece of lettuce", "place the bowl into the box", + "move the right arm to the left". +- Capture HOW the subtask is performed, not only WHAT — e.g. prefer + "grasp the handle of the sponge with the left hand" to "pick up the + sponge". - Subtasks are non-overlapping and cover the full episode in order. Choose the cut points yourself based on what you see in the video (gripper open/close events, contact, regrasps, transitions). @@ -34,7 +27,7 @@ Output strictly valid JSON of shape: {{ "subtasks": [ - {{"text": "<≤4-word verb phrase>", "start": , "end": }}, + {{"text": "", "start": , "end": }}, ... 
] }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt index 4a4719f54..d6f77883f 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt @@ -14,10 +14,12 @@ subtask boundary in the demonstration: - Subtask the robot is about to start: "{next_subtask}" - Time into episode: {timestamp:.2f}s -Write ONE compact interjection the user would naturally say at this -moment to prompt / confirm / encourage the robot to do "{next_subtask}". -Keep it like a mid-task coaching cue, not a full instruction paragraph. -Also write the robot's compact verbal acknowledgement. +Write ONE interjection the user would naturally say at this moment to +prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it +like a real human mid-task remark — conversational, varied, sometimes +just a nudge, sometimes a clarification, sometimes a small constraint +that the upcoming motion happens to satisfy. Plus the robot's verbal +acknowledgement. Hard rules: @@ -27,9 +29,7 @@ Hard rules: instead", DO NOT — those would contradict the demonstration. - The interjection must reference an object, location, or action that is plausible given the visible scene and the next subtask text. -- One short phrase or sentence each. Conversational, not robotic. -- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}." -- Keep robot speech very short: "OK.", "On it.", "Doing that." +- One sentence each. Conversational, not robotic. Style examples (vary the phrasing — don't reuse these verbatim): - "Now go ahead and {next_subtask}." 
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim): Output strictly valid JSON: {{ - "interjection": "", - "speech": "" + "interjection": "", + "speech": "" }} diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py index b58ea26a2..61148b1a4 100644 --- a/src/lerobot/scripts/lerobot_annotate.py +++ b/src/lerobot/scripts/lerobot_annotate.py @@ -141,43 +141,6 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None: ) print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True) - # Tag the upload with the codebase version. ``LeRobotDatasetMetadata`` - # resolves the dataset revision via ``get_safe_version`` which scans - # for tags like ``v3.0``; without a tag it raises - # ``RevisionNotFoundError``. Read the version straight from the - # dataset's own ``meta/info.json`` so we tag whatever the writer - # actually wrote (no accidental drift if the codebase floor moves). - from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415 - - info_path = root / "meta" / "info.json" - version_tag = CODEBASE_VERSION - if info_path.exists(): - try: - from lerobot.utils.io_utils import load_json # noqa: PLC0415 - - info = load_json(info_path) - ds_version = info.get("codebase_version") - if isinstance(ds_version, str) and ds_version.startswith("v"): - version_tag = ds_version - except Exception as exc: # noqa: BLE001 - print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True) - try: - api.create_tag( - repo_id=repo_id, - tag=version_tag, - repo_type="dataset", - exist_ok=True, - ) - print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True) - except Exception as exc: # noqa: BLE001 - print( - f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. 
" - "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. " - "Run: from huggingface_hub import HfApi; " - f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)", - flush=True, - ) - def main() -> None: annotate()