feat(annotate): deterministic plan, single-frame VQA, dataset tagging

Port the steerable-pipeline refinements developed on feat/smolvla-on-steerable back into the annotation pipeline itself:

- module_1_subtasks: imperative verb-first telegraphic labels with a consistent-object-noun rule and good/bad examples (no hard word cap).
- _generate_plan: drop the VLM round-trip; the plan is now a deterministic numbered list of still-todo subtasks, re-emitted at every subtask boundary so it shrinks as work progresses. Removes module_1_plan.txt.
- VqaConfig.K 3 -> 1: a VQA pair anchors exactly its emission frame, no stale-label temporal smear.
- lerobot-annotate: tag the pushed dataset with its codebase_version so LeRobotDataset can resolve a revision and load it.
- module_2_interjection: shorter, more natural mid-task cues.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 19:19:56 +00:00 · 2026-05-19 14:06:15 +02:00
parent 26013da699
commit ce47075d6b
7 changed files with 144 additions and 101 deletions
@@ -83,7 +83,14 @@ class VqaConfig:

    enabled: bool = True
    vqa_emission_hz: float = 1.0
-    K: int = 3
+    K: int = 1
+    """How many *consecutive* frames each emission tick anchors a VQA pair
+    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
+    against the *first* anchored frame's image, so anchoring K>1 frames
+    copies that same answer onto later frames where the scene has already
+    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
+    emission frame, no temporal smear. Raise it only to trade label
+    precision for more (noisier) VQA frames."""
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")


@@ -104,18 +104,29 @@ class PlanSubtasksMemoryModule:
                    "tool_calls": None,
                }
            )
-        # plan row at t=0
-        plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
-        if plan_text is not None:
-            rows.append(
-                {
-                    "role": "assistant",
-                    "content": plan_text,
-                    "style": "plan",
-                    "timestamp": float(t0),
-                    "tool_calls": None,
-                }
+        # Plan rows at every subtask boundary — including t=0 (start of
+        # the first subtask). Because the plan is just a numbered list
+        # of *still-todo* subtasks, re-emitting at each boundary makes
+        # the active plan shrink as work progresses: at frame t the
+        # rendered ``${plan}`` is the most recent emission, which
+        # contains exactly the subtasks that start at or after the
+        # current span's start. Saves the runtime from having to derive
+        # "what's still left" at inference time.
+        for span in subtask_spans:
+            boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
+            plan_text = self._generate_plan(
+                record, subtask_spans, refresh_t=boundary_t, task=effective_task
            )
+            if plan_text is not None:
+                rows.append(
+                    {
+                        "role": "assistant",
+                        "content": plan_text,
+                        "style": "plan",
+                        "timestamp": float(boundary_t),
+                        "tool_calls": None,
+                    }
+                )
        # memory rows at every subtask boundary except the very first start
        prior_memory = ""
        for i, span in enumerate(subtask_spans[1:], start=1):
@@ -327,48 +338,50 @@ class PlanSubtasksMemoryModule:

    def _generate_plan(
        self,
-        record: EpisodeRecord,
+        record: EpisodeRecord,  # noqa: ARG002  (kept for signature stability)
        subtask_spans: Sequence[dict[str, Any]],
        *,
        refresh_t: float | None = None,
-        interjection: str | None = None,
-        task: str | None = None,
+        interjection: str | None = None,  # noqa: ARG002
+        task: str | None = None,  # noqa: ARG002
    ) -> str | None:
+        """Deterministic plan = numbered list of *still-todo* subtasks.
+
+        Previously this called the VLM with a prompt that asked it to
+        compress the subtasks into a "compact hierarchical plan". That
+        produced longer-than-necessary plans, cost an extra VLM round-trip
+        per episode (plus one per interjection on refresh), and could
+        diverge from the actual subtask sequence the model is going to
+        execute. Replacing it with a plain enumeration keeps the plan
+        tightly aligned with the upcoming subtasks and removes the VLM
+        call entirely.
+
+        Layout — short imperative fragments prefixed by "N. ":
+
+            1. <subtask 1>
+            2. <subtask 2>
+            ...
+
+        On a refresh at ``refresh_t`` (called from ``run_plan_updates``
+        on interjection events, and from ``run_episode`` at every subtask
+        boundary), only subtasks whose start is at or after ``refresh_t``
+        are included — the plan shrinks as work progresses, so it always
+        describes what's left.
+        """
        if not subtask_spans:
            return None
-        subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
-        prompt = load_prompt("module_1_plan").format(
-            episode_task=(task if task is not None else record.episode_task),
-            subtasks_text=subtasks_text,
-            plan_max_steps=self.config.plan_max_steps,
+        remaining = [
+            s
+            for s in subtask_spans
+            if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
+        ]
+        if not remaining:
+            # Past the last subtask boundary on a late refresh — nothing
+            # left to plan; emit None so the caller skips the row.
+            return None
+        return "\n".join(
+            f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1)
        )
-        if refresh_t is not None:
-            # ``current_subtask`` is the span the refresh time falls into,
-            # so the model knows where in the demonstration the planner is
-            # standing when it re-emits.
-            current_subtask = ""
-            for span in subtask_spans:
-                if float(span["start"]) <= refresh_t and (
-                    "end" not in span or float(span["end"]) > refresh_t
-                ):
-                    current_subtask = span.get("text", "")
-                    break
-            if interjection:
-                prompt += (
-                    f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
-                    f"interjection: {interjection!r}. Current subtask just "
-                    f"before the interjection: {current_subtask!r}. Update "
-                    f"the plan so it reflects the interjection — drop or "
-                    f"reorder steps as needed; do not just restate.)\n"
-                )
-            else:
-                # Refresh without an interjection text: still tell the model
-                # where in the episode the plan stands so the re-emission
-                # is grounded. Should be rare — plan refreshes are
-                # interjection-driven by design.
-                prompt += f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current subtask: {current_subtask!r}.)\n"
-        plan = self._vlm_field(self._text_message(prompt), "plan")
-        return plan.strip() if isinstance(plan, str) else None

    def _generate_memory(
        self,
@@ -1,18 +0,0 @@
-You are the high-level planner for a robot demonstrating: "{episode_task}".
-
-Given the subtask decomposition below, write a concise hierarchical PLAN
-the robot should follow. Format the plan as a numbered list, one line per
-high-level step. The plan describes the full task; subtasks are the atomic
-skills used to execute it.
-
-Subtasks for context:
-{subtasks_text}
-
-Authoring rules:
- 3 to {plan_max_steps} steps.
- Each step describes one logical chunk of the task, not one motion.
- Steps must be in execution order.
- Plain prose, no JSON, no markdown headers.
-
-Output strictly valid JSON:
-  {{ "plan": "1. ...\n2. ...\n3. ..." }}
@@ -4,20 +4,30 @@ The user originally asked: "{episode_task}"

 You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
-the robot performs.
+the robot performs. Write short, telegraphic action labels.

-Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
+Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:

- Each subtask is one atomic skill the low-level policy can execute,
-  e.g. "pick up the orange", "place the bowl into the box".
- Write each subtask as an IMPERATIVE COMMAND to the robot, starting
-  with a verb: move, reach, pick up, grasp, place, put, push, pull,
-  open, close, turn, press, lift, insert, pour...
+- Each subtask = one atomic skill the low-level policy can execute.
+- Write each subtask as an IMPERATIVE COMMAND, starting with a verb:
+  move, reach, pick up, grasp, place, put, push, pull, open, close,
+  turn, press, lift, insert, pour...
+- Keep it SHORT — a verb phrase, not a sentence. Drop articles
+  ("the", "a") and adverbs ("carefully", "slowly"). Add a "how"
+  detail (which hand, which grasp point) ONLY when it is needed to
+  disambiguate.
 - NEVER use third person. Never write "the robot", "the arm", "the
-  gripper moves", "it picks up". Command the robot, do not describe it.
- Keep it SHORT — 3 to 8 words. Add a "how" detail (which hand, which
-  grasp point) ONLY when it is needed to disambiguate.
- Lower-case, no trailing period.
+  gripper moves", "it picks up" — the robot is implied. Command it,
+  do not describe it.
+- Use the exact object nouns from the task above. If the task says
+  "cube", every subtask says "cube" — never switch to "block". If it
+  says "box", never switch to "bin"/"container". Keep vocabulary
+  consistent across the whole episode.
+- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
+  "place blue cube in box", "open drawer", "release yellow cube".
+- Bad: "the robot arm moves towards the blue cube" (third person,
+  too long), "carefully pick up the cube" (adverb, article),
+  "release the yellow block" ("block" when the task said "cube").
 - Subtasks are non-overlapping and cover the full episode in order.
  Choose the cut points yourself based on what you see in the video
  (gripper open/close events, contact, regrasps, transitions).
@@ -26,22 +36,11 @@ Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
 - Every subtask's [start_time, end_time] must lie within
  [0.0, {episode_duration}] seconds.

-Style examples:
-
-  Good                            Bad (do NOT produce these)
-  "pick up the orange"            "the robot arm moves to the orange"
-  "move to the yellow block"      "the gripper approaches the block"
-  "close gripper to grasp         "close the gripper to grasp the
-   the yellow cube"                yellow cube so it can lift it"
-  "open the toaster oven"         "it opens the toaster oven door"
-  "put the bagel on the           "the white plate now has the bagel
-   white plate"                    placed on it by the arm"
-
 Output strictly valid JSON of shape:

  {{
    "subtasks": [
-      {{"text": "<short imperative command>", "start": <float>, "end": <float>}},
+      {{"text": "<short imperative verb phrase>", "start": <float>, "end": <float>}},
      ...
    ]
  }}
@@ -14,12 +14,10 @@ subtask boundary in the demonstration:
 - Subtask the robot is about to start: "{next_subtask}"
 - Time into episode: {timestamp:.2f}s

-Write ONE interjection the user would naturally say at this moment to
-prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
-like a real human mid-task remark — conversational, varied, sometimes
-just a nudge, sometimes a clarification, sometimes a small constraint
-that the upcoming motion happens to satisfy. Plus the robot's verbal
-acknowledgement.
+Write ONE compact interjection the user would naturally say at this
+moment to prompt / confirm / encourage the robot to do "{next_subtask}".
+Keep it like a mid-task coaching cue, not a full instruction paragraph.
+Also write the robot's compact verbal acknowledgement.

 Hard rules:

@@ -29,7 +27,9 @@ Hard rules:
  instead", DO NOT — those would contradict the demonstration.
 - The interjection must reference an object, location, or action that
  is plausible given the visible scene and the next subtask text.
- One sentence each. Conversational, not robotic.
+- One short phrase or sentence each. Conversational, not robotic.
+- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
+- Keep robot speech very short: "OK.", "On it.", "Doing that."

 Style examples (vary the phrasing — don't reuse these verbatim):
  - "Now go ahead and {next_subtask}."
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):

 Output strictly valid JSON:
  {{
-    "interjection": "<single sentence the user says, asking for the next subtask>",
-    "speech":       "<single sentence the robot speaks back, confirming and starting>"
+    "interjection": "<short cue from the user, asking for the next subtask>",
+    "speech":       "<short robot acknowledgement>"
  }}
@@ -149,6 +149,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
    )
    print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)

+    # Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
+    # resolves the dataset revision via ``get_safe_version`` which scans
+    # for tags like ``v3.0``; without a tag it raises
+    # ``RevisionNotFoundError``. Read the version straight from the
+    # dataset's own ``meta/info.json`` so we tag whatever the writer
+    # actually wrote (no accidental drift if the codebase floor moves).
+    from lerobot.datasets.dataset_metadata import CODEBASE_VERSION  # noqa: PLC0415
+
+    info_path = root / "meta" / "info.json"
+    version_tag = CODEBASE_VERSION
+    if info_path.exists():
+        try:
+            from lerobot.utils.io_utils import load_json  # noqa: PLC0415
+
+            info = load_json(info_path)
+            ds_version = info.get("codebase_version")
+            if isinstance(ds_version, str) and ds_version.startswith("v"):
+                version_tag = ds_version
+        except Exception as exc:  # noqa: BLE001
+            print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
+    try:
+        api.create_tag(
+            repo_id=repo_id,
+            tag=version_tag,
+            repo_type="dataset",
+            exist_ok=True,
+        )
+        print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
+    except Exception as exc:  # noqa: BLE001
+        print(
+            f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
+            "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
+            "Run: from huggingface_hub import HfApi; "
+            f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
+            flush=True,
+        )
+

 def main() -> None:
    annotate()
@@ -80,7 +80,6 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
                    {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
                ]
            },
-            "concise hierarchical PLAN": {"plan": "1. grasp\n2. wipe\n3. place"},
            "Update the memory": {"memory": "wiped the counter once"},
        },
    )
@@ -96,10 +95,16 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
    frame_set = set(record.frame_timestamps)
    for row in rows:
        assert row["timestamp"] in frame_set
-    # exactly one plan row at t0
-    plan_rows = [r for r in rows if r["style"] == "plan"]
-    assert len(plan_rows) == 1
+    # one plan row per subtask boundary; the first lands at t0 and each
+    # plan is the deterministic numbered list of still-todo subtasks
+    plan_rows = sorted((r for r in rows if r["style"] == "plan"), key=lambda r: r["timestamp"])
+    subtask_rows = [r for r in rows if r["style"] == "subtask"]
+    assert len(plan_rows) == len(subtask_rows)
    assert plan_rows[0]["timestamp"] == record.frame_timestamps[0]
+    # the t0 plan enumerates all subtasks; later plans shrink
+    assert plan_rows[0]["content"].startswith("1. ")
+    assert len(plan_rows[0]["content"].splitlines()) == len(subtask_rows)
+    assert len(plan_rows[-1]["content"].splitlines()) == 1


 def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: Path, tmp_path: Path) -> None: