diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index dd439f9b9..f6b9204bc 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -83,7 +83,14 @@ class VqaConfig: enabled: bool = True vqa_emission_hz: float = 1.0 - K: int = 3 + K: int = 1 + """How many *consecutive* frames each emission tick anchors a VQA pair + to. The VLM grounds its answer (bbox / keypoint coordinates, count, …) + against the *first* anchored frame's image, so anchoring K>1 frames + copies that same answer onto later frames where the scene has already + moved — stale labels. Default ``1``: a VQA pair lands on exactly its + emission frame, no temporal smear. Raise it only to trade label + precision for more (noisier) VQA frames.""" question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial") diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index e07f9cc3b..0218e79b1 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -104,18 +104,29 @@ class PlanSubtasksMemoryModule: "tool_calls": None, } ) - # plan row at t=0 - plan_text = self._generate_plan(record, subtask_spans, task=effective_task) - if plan_text is not None: - rows.append( - { - "role": "assistant", - "content": plan_text, - "style": "plan", - "timestamp": float(t0), - "tool_calls": None, - } + # Plan rows at every subtask boundary — including t=0 (start of + # the first subtask). Because the plan is just a numbered list + # of *still-todo* subtasks, re-emitting at each boundary makes + # the active plan shrink as work progresses: at frame t the + # rendered ``${plan}`` is the most recent emission, which + # contains exactly the subtasks that started at or after the + # current span. Saves the runtime from having to derive + # "what's still left" at inference time. + for span in subtask_spans: + boundary_t = snap_to_frame(span["start"], record.frame_timestamps) + plan_text = self._generate_plan( + record, subtask_spans, refresh_t=boundary_t, task=effective_task ) + if plan_text is not None: + rows.append( + { + "role": "assistant", + "content": plan_text, + "style": "plan", + "timestamp": float(boundary_t), + "tool_calls": None, + } + ) # memory rows at every subtask boundary except the very first start prior_memory = "" for i, span in enumerate(subtask_spans[1:], start=1): @@ -327,48 +338,50 @@ class PlanSubtasksMemoryModule: def _generate_plan( self, - record: EpisodeRecord, + record: EpisodeRecord, # noqa: ARG002 (kept for signature stability) subtask_spans: Sequence[dict[str, Any]], *, refresh_t: float | None = None, - interjection: str | None = None, - task: str | None = None, + interjection: str | None = None, # noqa: ARG002 + task: str | None = None, # noqa: ARG002 ) -> str | None: + """Deterministic plan = numbered list of *still-todo* subtasks. + + Previously this called the VLM with a prompt that asked it to + compress the subtasks into a "compact hierarchical plan". That + produced longer-than-necessary plans, cost an extra VLM round-trip + per episode (plus one per interjection on refresh), and could + diverge from the actual subtask sequence the model is going to + execute. Replacing it with a plain summarisation keeps the plan + tightly aligned with the upcoming subtasks and removes the VLM + call entirely. + + Layout — short imperative fragments prefixed by "N. ": + + 1. + 2. + ... + + On a refresh at ``refresh_t`` (called from ``run_plan_updates`` + on interjection events, and from ``run_episode`` at every subtask + boundary), only subtasks whose start is at or after ``refresh_t`` + are included — the plan shrinks as work progresses, so it always + describes what's left. + """ if not subtask_spans: return None - subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans) - prompt = load_prompt("module_1_plan").format( - episode_task=(task if task is not None else record.episode_task), - subtasks_text=subtasks_text, - plan_max_steps=self.config.plan_max_steps, + remaining = [ + s + for s in subtask_spans + if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t) + ] + if not remaining: + # Past the last subtask boundary on a late refresh — nothing + # left to plan; emit None so the caller skips the row. + return None + return "\n".join( + f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1) ) - if refresh_t is not None: - # ``current_subtask`` is the span the refresh time falls into, - # so the model knows where in the demonstration the planner is - # standing when it re-emits. - current_subtask = "" - for span in subtask_spans: - if float(span["start"]) <= refresh_t and ( - "end" not in span or float(span["end"]) > refresh_t - ): - current_subtask = span.get("text", "") - break - if interjection: - prompt += ( - f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user " - f"interjection: {interjection!r}. Current subtask just " - f"before the interjection: {current_subtask!r}. Update " - f"the plan so it reflects the interjection — drop or " - f"reorder steps as needed; do not just restate.)\n" - ) - else: - # Refresh without an interjection text: still tell the model - # where in the episode the plan stands so the re-emission - # is grounded. Should be rare — plan refreshes are - # interjection-driven by design. - prompt += f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current subtask: {current_subtask!r}.)\n" - plan = self._vlm_field(self._text_message(prompt), "plan") - return plan.strip() if isinstance(plan, str) else None def _generate_memory( self, diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt deleted file mode 100644 index b0121c977..000000000 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt +++ /dev/null @@ -1,18 +0,0 @@ -You are the high-level planner for a robot demonstrating: "{episode_task}". - -Given the subtask decomposition below, write a concise hierarchical PLAN -the robot should follow. Format the plan as a numbered list, one line per -high-level step. The plan describes the full task; subtasks are the atomic -skills used to execute it. - -Subtasks for context: -{subtasks_text} - -Authoring rules: -- 3 to {plan_max_steps} steps. -- Each step describes one logical chunk of the task, not one motion. -- Steps must be in execution order. -- Plain prose, no JSON, no markdown headers. - -Output strictly valid JSON: - {{ "plan": "1. ...\n2. ...\n3. ..." }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt index cd1303cbe..c530b1340 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt @@ -4,20 +4,30 @@ The user originally asked: "{episode_task}" You are shown the entire demonstration as a single video. Watch the whole clip, then segment it into a list of consecutive atomic subtasks -the robot performs. +the robot performs. Write short, telegraphic action labels. -Authoring rules — based on Hi Robot (Shi 2025) atom granularity: +Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts: -- Each subtask is one atomic skill the low-level policy can execute, - e.g. "pick up the orange", "place the bowl into the box". -- Write each subtask as an IMPERATIVE COMMAND to the robot, starting - with a verb: move, reach, pick up, grasp, place, put, push, pull, - open, close, turn, press, lift, insert, pour... +- Each subtask = one atomic skill the low-level policy can execute. +- Write each subtask as an IMPERATIVE COMMAND, starting with a verb: + move, reach, pick up, grasp, place, put, push, pull, open, close, + turn, press, lift, insert, pour... +- Keep it SHORT — a verb phrase, not a sentence. Drop articles + ("the", "a") and adverbs ("carefully", "slowly"). Add a "how" + detail (which hand, which grasp point) ONLY when it is needed to + disambiguate. - NEVER use third person. Never write "the robot", "the arm", "the - gripper moves", "it picks up". Command the robot, do not describe it. -- Keep it SHORT — 3 to 8 words. Add a "how" detail (which hand, which - grasp point) ONLY when it is needed to disambiguate. -- Lower-case, no trailing period. + gripper moves", "it picks up" — the robot is implied. Command it, + do not describe it. +- Use the exact object nouns from the task above. If the task says + "cube", every subtask says "cube" — never switch to "block". If it + says "box", never switch to "bin"/"container". Keep vocabulary + consistent across the whole episode. +- Good: "move to blue cube", "grasp blue cube", "lift blue cube", + "place blue cube in box", "open drawer", "release yellow cube". +- Bad: "the robot arm moves towards the blue cube" (third person, + too long), "carefully pick up the cube" (adverb, article), + "release the yellow block" ("block" when the task said "cube"). - Subtasks are non-overlapping and cover the full episode in order. Choose the cut points yourself based on what you see in the video (gripper open/close events, contact, regrasps, transitions). @@ -26,22 +36,11 @@ Authoring rules — based on Hi Robot (Shi 2025) atom granularity: - Every subtask's [start_time, end_time] must lie within [0.0, {episode_duration}] seconds. -Style examples: - - Good Bad (do NOT produce these) - "pick up the orange" "the robot arm moves to the orange" - "move to the yellow block" "the gripper approaches the block" - "close gripper to grasp "close the gripper to grasp the - the yellow cube" yellow cube so it can lift it" - "open the toaster oven" "it opens the toaster oven door" - "put the bagel on the "the white plate now has the bagel - white plate" placed on it by the arm" - Output strictly valid JSON of shape: {{ "subtasks": [ - {{"text": "", "start": , "end": }}, + {{"text": "", "start": , "end": }}, ... ] }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt index d6f77883f..4a4719f54 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt @@ -14,12 +14,10 @@ subtask boundary in the demonstration: - Subtask the robot is about to start: "{next_subtask}" - Time into episode: {timestamp:.2f}s -Write ONE interjection the user would naturally say at this moment to -prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it -like a real human mid-task remark — conversational, varied, sometimes -just a nudge, sometimes a clarification, sometimes a small constraint -that the upcoming motion happens to satisfy. Plus the robot's verbal -acknowledgement. +Write ONE compact interjection the user would naturally say at this +moment to prompt / confirm / encourage the robot to do "{next_subtask}". +Keep it like a mid-task coaching cue, not a full instruction paragraph. +Also write the robot's compact verbal acknowledgement. Hard rules: @@ -29,7 +27,9 @@ Hard rules: instead", DO NOT — those would contradict the demonstration. - The interjection must reference an object, location, or action that is plausible given the visible scene and the next subtask text. -- One sentence each. Conversational, not robotic. +- One short phrase or sentence each. Conversational, not robotic. +- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}." +- Keep robot speech very short: "OK.", "On it.", "Doing that." Style examples (vary the phrasing — don't reuse these verbatim): - "Now go ahead and {next_subtask}." @@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim): Output strictly valid JSON: {{ - "interjection": "", - "speech": "" + "interjection": "", + "speech": "" }} diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py index aefe0e9eb..86d3ab3fa 100644 --- a/src/lerobot/scripts/lerobot_annotate.py +++ b/src/lerobot/scripts/lerobot_annotate.py @@ -149,6 +149,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None: ) print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True) + # Tag the upload with the codebase version. ``LeRobotDatasetMetadata`` + # resolves the dataset revision via ``get_safe_version`` which scans + # for tags like ``v3.0``; without a tag it raises + # ``RevisionNotFoundError``. Read the version straight from the + # dataset's own ``meta/info.json`` so we tag whatever the writer + # actually wrote (no accidental drift if the codebase floor moves). + from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415 + + info_path = root / "meta" / "info.json" + version_tag = CODEBASE_VERSION + if info_path.exists(): + try: + from lerobot.utils.io_utils import load_json # noqa: PLC0415 + + info = load_json(info_path) + ds_version = info.get("codebase_version") + if isinstance(ds_version, str) and ds_version.startswith("v"): + version_tag = ds_version + except Exception as exc: # noqa: BLE001 + print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True) + try: + api.create_tag( + repo_id=repo_id, + tag=version_tag, + repo_type="dataset", + exist_ok=True, + ) + print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True) + except Exception as exc: # noqa: BLE001 + print( + f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. " + "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. " + "Run: from huggingface_hub import HfApi; " + f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)", + flush=True, + ) + def main() -> None: annotate() diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py index 5b413794a..73685a079 100644 --- a/tests/annotations/test_modules.py +++ b/tests/annotations/test_modules.py @@ -80,7 +80,6 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path: {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1}, ] }, - "concise hierarchical PLAN": {"plan": "1. grasp\n2. wipe\n3. place"}, "Update the memory": {"memory": "wiped the counter once"}, }, ) @@ -96,10 +95,16 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path: frame_set = set(record.frame_timestamps) for row in rows: assert row["timestamp"] in frame_set - # exactly one plan row at t0 - plan_rows = [r for r in rows if r["style"] == "plan"] - assert len(plan_rows) == 1 + # one plan row per subtask boundary; the first lands at t0 and each + # plan is the deterministic numbered list of still-todo subtasks + plan_rows = sorted((r for r in rows if r["style"] == "plan"), key=lambda r: r["timestamp"]) + subtask_rows = [r for r in rows if r["style"] == "subtask"] + assert len(plan_rows) == len(subtask_rows) assert plan_rows[0]["timestamp"] == record.frame_timestamps[0] + # the t0 plan enumerates all subtasks; later plans shrink + assert plan_rows[0]["content"].startswith("1. ") + assert len(plan_rows[0]["content"].splitlines()) == len(subtask_rows) + assert len(plan_rows[-1]["content"].splitlines()) == 1 def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: Path, tmp_path: Path) -> None: