feat(annotate): deterministic plan, single-frame VQA, dataset tagging

Port the steerable-pipeline refinements developed on
feat/smolvla-on-steerable back into the annotation pipeline itself:

- module_1_subtasks: imperative verb-first telegraphic labels with a
  consistent-object-noun rule and good/bad examples (no hard word cap).
- _generate_plan: drop the VLM round-trip; the plan is now a
  deterministic numbered list of still-todo subtasks, re-emitted at
  every subtask boundary so it shrinks as work progresses. Removes
  module_1_plan.txt.
- VqaConfig.K 3 -> 1: a VQA pair anchors exactly its emission frame, no
  stale-label temporal smear.
- lerobot-annotate: tag the pushed dataset with its codebase_version so
  LeRobotDataset can resolve a revision and load it.
- module_2_interjection: shorter, more natural mid-task cues.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 02:36:11 +00:00 · 2026-05-19 14:06:15 +02:00
parent 26013da699
commit ce47075d6b
7 changed files with 144 additions and 101 deletions
@@ -83,7 +83,14 @@ class VqaConfig:
    enabled: bool = True
    vqa_emission_hz: float = 1.0
-    K: int = 3
+    K: int = 1
    """How many *consecutive* frames each emission tick anchors a VQA pair
    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
    against the *first* anchored frame's image, so anchoring K>1 frames
    copies that same answer onto later frames where the scene has already
    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
    emission frame, no temporal smear. Raise it only to trade label
    precision for more (noisier) VQA frames."""
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
@@ -104,18 +104,29 @@ class PlanSubtasksMemoryModule:
                    "tool_calls": None,
                }
            )
-        # plan row at t=0
+        # Plan rows at every subtask boundary — including t=0 (start of
-        plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
+        # the first subtask). Because the plan is just a numbered list
-        if plan_text is not None:
+        # of *still-todo* subtasks, re-emitting at each boundary makes
-            rows.append(
+        # the active plan shrink as work progresses: at frame t the
-                {
+        # rendered ``${plan}`` is the most recent emission, which
-                    "role": "assistant",
+        # contains exactly the subtasks that started at or after the
-                    "content": plan_text,
+        # current span. Saves the runtime from having to derive
-                    "style": "plan",
+        # "what's still left" at inference time.
-                    "timestamp": float(t0),
+        for span in subtask_spans:
-                    "tool_calls": None,
+            boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
-                }
+            plan_text = self._generate_plan(
                record, subtask_spans, refresh_t=boundary_t, task=effective_task
            )
            if plan_text is not None:
                rows.append(
                    {
                        "role": "assistant",
                        "content": plan_text,
                        "style": "plan",
                        "timestamp": float(boundary_t),
                        "tool_calls": None,
                    }
                )
        # memory rows at every subtask boundary except the very first start
        prior_memory = ""
        for i, span in enumerate(subtask_spans[1:], start=1):
@@ -327,48 +338,50 @@ class PlanSubtasksMemoryModule:
    def _generate_plan(
        self,
-        record: EpisodeRecord,
+        record: EpisodeRecord,  # noqa: ARG002  (kept for signature stability)
        subtask_spans: Sequence[dict[str, Any]],
        *,
        refresh_t: float | None = None,
-        interjection: str | None = None,
+        interjection: str | None = None,  # noqa: ARG002
-        task: str | None = None,
+        task: str | None = None,  # noqa: ARG002
    ) -> str | None:
        """Deterministic plan = numbered list of *still-todo* subtasks.
        Previously this called the VLM with a prompt that asked it to
        compress the subtasks into a "compact hierarchical plan". That
        produced longer-than-necessary plans, cost an extra VLM round-trip
        per episode (plus one per interjection on refresh), and could
        diverge from the actual subtask sequence the model is going to
        execute. Replacing it with a verbatim numbered enumeration of the
        subtasks keeps the plan tightly aligned with the upcoming
        subtasks and removes the VLM call entirely.
        Layout — short imperative fragments prefixed by "N. ":
            1. <subtask 1>
            2. <subtask 2>
            ...
        On a refresh at ``refresh_t`` (called from ``run_plan_updates``
        on interjection events, and from ``run_episode`` at every subtask
        boundary), only subtasks whose start is at or after ``refresh_t``
        are included — the plan shrinks as work progresses, so it always
        describes what's left.
        """
        if not subtask_spans:
            return None
-        subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
+        remaining = [
-        prompt = load_prompt("module_1_plan").format(
+            s
-            episode_task=(task if task is not None else record.episode_task),
+            for s in subtask_spans
-            subtasks_text=subtasks_text,
+            if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
-            plan_max_steps=self.config.plan_max_steps,
+        ]
        if not remaining:
            # Past the last subtask boundary on a late refresh — nothing
            # left to plan; emit None so the caller skips the row.
            return None
        return "\n".join(
            f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1)
        )
        if refresh_t is not None:
            # ``current_subtask`` is the span the refresh time falls into,
            # so the model knows where in the demonstration the planner is
            # standing when it re-emits.
            current_subtask = ""
            for span in subtask_spans:
                if float(span["start"]) <= refresh_t and (
                    "end" not in span or float(span["end"]) > refresh_t
                ):
                    current_subtask = span.get("text", "")
                    break
            if interjection:
                prompt += (
                    f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
                    f"interjection: {interjection!r}. Current subtask just "
                    f"before the interjection: {current_subtask!r}. Update "
                    f"the plan so it reflects the interjection — drop or "
                    f"reorder steps as needed; do not just restate.)\n"
                )
            else:
                # Refresh without an interjection text: still tell the model
                # where in the episode the plan stands so the re-emission
                # is grounded. Should be rare — plan refreshes are
                # interjection-driven by design.
                prompt += f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current subtask: {current_subtask!r}.)\n"
        plan = self._vlm_field(self._text_message(prompt), "plan")
        return plan.strip() if isinstance(plan, str) else None
    def _generate_memory(
        self,
@@ -1,18 +0,0 @@
 You are the high-level planner for a robot demonstrating: "{episode_task}".
 Given the subtask decomposition below, write a concise hierarchical PLAN
 the robot should follow. Format the plan as a numbered list, one line per
 high-level step. The plan describes the full task; subtasks are the atomic
 skills used to execute it.
 Subtasks for context:
 {subtasks_text}
 Authoring rules:
 - 3 to {plan_max_steps} steps.
 - Each step describes one logical chunk of the task, not one motion.
 - Steps must be in execution order.
 - Plain prose, no JSON, no markdown headers.
 Output strictly valid JSON:
  {{ "plan": "1. ...\n2. ...\n3. ..." }}
@@ -4,20 +4,30 @@ The user originally asked: "{episode_task}"
 You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
-the robot performs.
+the robot performs. Write short, telegraphic action labels.
-Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
+Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
- Each subtask is one atomic skill the low-level policy can execute,
+- Each subtask = one atomic skill the low-level policy can execute.
-  e.g. "pick up the orange", "place the bowl into the box".
+- Write each subtask as an IMPERATIVE COMMAND, starting with a verb:
- Write each subtask as an IMPERATIVE COMMAND to the robot, starting
+  move, reach, pick up, grasp, place, put, push, pull, open, close,
-  with a verb: move, reach, pick up, grasp, place, put, push, pull,
+  turn, press, lift, insert, pour...
-  open, close, turn, press, lift, insert, pour...
+- Keep it SHORT — a verb phrase, not a sentence. Drop articles
  ("the", "a") and adverbs ("carefully", "slowly"). Add a "how"
  detail (which hand, which grasp point) ONLY when it is needed to
  disambiguate.
 - NEVER use third person. Never write "the robot", "the arm", "the
-  gripper moves", "it picks up". Command the robot, do not describe it.
+  gripper moves", "it picks up" — the robot is implied. Command it,
- Keep it SHORT — 3 to 8 words. Add a "how" detail (which hand, which
+  do not describe it.
-  grasp point) ONLY when it is needed to disambiguate.
+- Use the exact object nouns from the task above. If the task says
- Lower-case, no trailing period.
+  "cube", every subtask says "cube" — never switch to "block". If it
  says "box", never switch to "bin"/"container". Keep vocabulary
  consistent across the whole episode.
 - Good: "move to blue cube", "grasp blue cube", "lift blue cube",
  "place blue cube in box", "open drawer", "release yellow cube".
 - Bad: "the robot arm moves towards the blue cube" (third person,
  too long), "carefully pick up the cube" (adverb, article),
  "release the yellow block" ("block" when the task said "cube").
 - Subtasks are non-overlapping and cover the full episode in order.
  Choose the cut points yourself based on what you see in the video
  (gripper open/close events, contact, regrasps, transitions).
@@ -26,22 +36,11 @@ Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
 - Every subtask's [start_time, end_time] must lie within
  [0.0, {episode_duration}] seconds.
 Style examples:
  Good                            Bad (do NOT produce these)
  "pick up the orange"            "the robot arm moves to the orange"
  "move to the yellow block"      "the gripper approaches the block"
  "close gripper to grasp         "close the gripper to grasp the
   the yellow cube"                yellow cube so it can lift it"
  "open the toaster oven"         "it opens the toaster oven door"
  "put the bagel on the           "the white plate now has the bagel
   white plate"                    placed on it by the arm"
 Output strictly valid JSON of shape:
  {{
    "subtasks": [
-      {{"text": "<short imperative command>", "start": <float>, "end": <float>}},
+      {{"text": "<short imperative verb phrase>", "start": <float>, "end": <float>}},
      ...
    ]
  }}
@@ -14,12 +14,10 @@ subtask boundary in the demonstration:
 - Subtask the robot is about to start: "{next_subtask}"
 - Time into episode: {timestamp:.2f}s
-Write ONE interjection the user would naturally say at this moment to
+Write ONE compact interjection the user would naturally say at this
-prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
+moment to prompt / confirm / encourage the robot to do "{next_subtask}".
-like a real human mid-task remark — conversational, varied, sometimes
+Keep it like a mid-task coaching cue, not a full instruction paragraph.
-just a nudge, sometimes a clarification, sometimes a small constraint
+Also write the robot's compact verbal acknowledgement.
 that the upcoming motion happens to satisfy. Plus the robot's verbal
 acknowledgement.
 Hard rules:
@@ -29,7 +27,9 @@ Hard rules:
  instead", DO NOT — those would contradict the demonstration.
 - The interjection must reference an object, location, or action that
  is plausible given the visible scene and the next subtask text.
- One sentence each. Conversational, not robotic.
+- One short phrase or sentence each. Conversational, not robotic.
 - Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
 - Keep robot speech very short: "OK.", "On it.", "Doing that."
 Style examples (vary the phrasing — don't reuse these verbatim):
  - "Now go ahead and {next_subtask}."
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):
 Output strictly valid JSON:
  {{
-    "interjection": "<single sentence the user says, asking for the next subtask>",
+    "interjection": "<short cue from the user, asking for the next subtask>",
-    "speech":       "<single sentence the robot speaks back, confirming and starting>"
+    "speech":       "<short robot acknowledgement>"
  }}
@@ -149,6 +149,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
    )
    print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
    # Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
    # resolves the dataset revision via ``get_safe_version`` which scans
    # for tags like ``v3.0``; without a tag it raises
    # ``RevisionNotFoundError``. Read the version straight from the
    # dataset's own ``meta/info.json`` so we tag whatever the writer
    # actually wrote (no accidental drift if the codebase floor moves).
    # If ``info.json`` is missing, cannot be read, or carries no
    # ``v``-prefixed ``codebase_version`` string, fall back to the
    # library's ``CODEBASE_VERSION``.
    from lerobot.datasets.dataset_metadata import CODEBASE_VERSION  # noqa: PLC0415
    info_path = root / "meta" / "info.json"
    version_tag = CODEBASE_VERSION
    if info_path.exists():
        try:
            from lerobot.utils.io_utils import load_json  # noqa: PLC0415
            info = load_json(info_path)
            ds_version = info.get("codebase_version")
            if isinstance(ds_version, str) and ds_version.startswith("v"):
                version_tag = ds_version
        except Exception as exc:  # noqa: BLE001
            print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
    try:
        api.create_tag(
            repo_id=repo_id,
            tag=version_tag,
            repo_type="dataset",
            exist_ok=True,
        )
        print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
    except Exception as exc:  # noqa: BLE001
        print(
            f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
            "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
            "Run: from huggingface_hub import HfApi; "
            f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
            flush=True,
        )
 def main() -> None:
    annotate()
@@ -80,7 +80,6 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
                    {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
                ]
            },
            "concise hierarchical PLAN": {"plan": "1. grasp\n2. wipe\n3. place"},
            "Update the memory": {"memory": "wiped the counter once"},
        },
    )
@@ -96,10 +95,16 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
    frame_set = set(record.frame_timestamps)
    for row in rows:
        assert row["timestamp"] in frame_set
-    # exactly one plan row at t0
+    # one plan row per subtask boundary; the first lands at t0 and each
-    plan_rows = [r for r in rows if r["style"] == "plan"]
+    # plan is the deterministic numbered list of still-todo subtasks
-    assert len(plan_rows) == 1
+    plan_rows = sorted((r for r in rows if r["style"] == "plan"), key=lambda r: r["timestamp"])
    subtask_rows = [r for r in rows if r["style"] == "subtask"]
    assert len(plan_rows) == len(subtask_rows)
    assert plan_rows[0]["timestamp"] == record.frame_timestamps[0]
    # the t0 plan enumerates all subtasks; later plans shrink
    assert plan_rows[0]["content"].startswith("1. ")
    assert len(plan_rows[0]["content"].splitlines()) == len(subtask_rows)
    assert len(plan_rows[-1]["content"].splitlines()) == 1
 def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: Path, tmp_path: Path) -> None: