revert(annotate): move pipeline changes to base PR (#3471)

The deterministic-plan rewrite, single-frame VQA (K 3->1), dataset version tagging, telegraphic-subtask prompt and shorter interjection prompt belong in the annotation pipeline itself, not in the SmolVLA training PR. They have been applied to feat/language-annotation-pipeline (#3471). Reverting these six files here to the merge-base so they drop out of this PR's diff; #3491 will inherit the canonical versions when it next rebases on its base.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 02:59:50 +00:00 · 2026-05-19 14:07:23 +02:00
parent bb31988915
commit 182f10184f
6 changed files with 90 additions and 130 deletions
@@ -114,14 +114,7 @@ class Module3Config:

    enabled: bool = True
    vqa_emission_hz: float = 1.0
-    K: int = 1
-    """How many *consecutive* frames each emission tick anchors a VQA pair
-    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
-    against the *first* anchored frame's image, so anchoring K>1 frames
-    copies that same answer onto later frames where the scene has already
-    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
-    emission frame, no temporal smear. Raise it only to trade label
-    precision for more (noisier) VQA frames."""
+    K: int = 3
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")


@@ -116,29 +116,18 @@ class PlanSubtasksMemoryModule:
                    "tool_calls": None,
                }
            )
-        # Plan rows at every subtask boundary — including t=0 (start of
-        # the first subtask). Because the plan is just a numbered list
-        # of *still-todo* subtasks, re-emitting at each boundary makes
-        # the active plan shrink as work progresses: at frame t the
-        # rendered ``${plan}`` is the most recent emission, which
-        # contains exactly the subtasks that started at or after the
-        # current span. Saves the runtime from having to derive
-        # "what's still left" at inference time.
-        for span in subtask_spans:
-            boundary_t = _snap_to_frame(span["start"], record.frame_timestamps)
-            plan_text = self._generate_plan(
-                record, subtask_spans, refresh_t=boundary_t, task=effective_task
+        # plan row at t=0
+        plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
+        if plan_text is not None:
+            rows.append(
+                {
+                    "role": "assistant",
+                    "content": plan_text,
+                    "style": "plan",
+                    "timestamp": float(t0),
+                    "tool_calls": None,
+                }
            )
-            if plan_text is not None:
-                rows.append(
-                    {
-                        "role": "assistant",
-                        "content": plan_text,
-                        "style": "plan",
-                        "timestamp": float(boundary_t),
-                        "tool_calls": None,
-                    }
-                )
        # memory rows at every subtask boundary except the very first start
        prior_memory = ""
        for i, span in enumerate(subtask_spans[1:], start=1):
@@ -383,50 +372,54 @@ class PlanSubtasksMemoryModule:

    def _generate_plan(
        self,
-        record: EpisodeRecord,  # noqa: ARG002  (kept for signature stability)
+        record: EpisodeRecord,
        subtask_spans: Sequence[dict[str, Any]],
        *,
        refresh_t: float | None = None,
-        interjection: str | None = None,  # noqa: ARG002
-        task: str | None = None,  # noqa: ARG002
+        interjection: str | None = None,
+        task: str | None = None,
    ) -> str | None:
-        """Deterministic plan = numbered list of *still-todo* subtasks.
-
-        Previously this called the VLM with a prompt that asked it to
-        compress the subtasks into a "compact hierarchical plan". That
-        produced longer-than-necessary plans, cost an extra VLM round-trip
-        per episode (plus one per interjection on refresh), and could
-        diverge from the actual subtask sequence the model is going to
-        execute. Replacing it with a plain summarisation keeps the plan
-        tightly aligned with the upcoming subtasks and removes the VLM
-        call entirely.
-
-        Layout (matches the v2 plan style — short imperative fragments
-        prefixed by "N. "):
-
-            1. <subtask 1>
-            2. <subtask 2>
-            ...
-
-        On a refresh at ``refresh_t`` (called from ``run_plan_updates``
-        on interjection events), only subtasks whose start is at or
-        after ``refresh_t`` are included — the plan shrinks as work
-        progresses, so it always describes what's left.
-        """
        if not subtask_spans:
            return None
-        remaining = [
-            s for s in subtask_spans
-            if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
-        ]
-        if not remaining:
-            # Past the last subtask boundary on a late refresh — nothing
-            # left to plan; emit None so the caller skips the row.
-            return None
-        return "\n".join(
-            f"{i}. {span.get('text', '').strip()}"
-            for i, span in enumerate(remaining, start=1)
+        subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
+        prompt = load_prompt("module_1_plan").format(
+            episode_task=(task if task is not None else record.episode_task),
+            subtasks_text=subtasks_text,
+            plan_max_steps=self.config.plan_max_steps,
        )
+        if refresh_t is not None:
+            # ``current_subtask`` is the span the refresh time falls into,
+            # so the model knows where in the demonstration the planner is
+            # standing when it re-emits.
+            current_subtask = ""
+            for span in subtask_spans:
+                if float(span["start"]) <= refresh_t and (
+                    "end" not in span or float(span["end"]) > refresh_t
+                ):
+                    current_subtask = span.get("text", "")
+                    break
+            if interjection:
+                prompt += (
+                    f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
+                    f"interjection: {interjection!r}. Current subtask just "
+                    f"before the interjection: {current_subtask!r}. Update "
+                    f"the plan so it reflects the interjection — drop or "
+                    f"reorder steps as needed; do not just restate.)\n"
+                )
+            else:
+                # Refresh without an interjection text: still tell the model
+                # where in the episode the plan stands so the re-emission
+                # is grounded. Should be rare — plan refreshes are
+                # interjection-driven by design.
+                prompt += (
+                    f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current "
+                    f"subtask: {current_subtask!r}.)\n"
+                )
+        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+        result = self.vlm.generate_json([messages])[0]
+        if isinstance(result, dict) and isinstance(result.get("plan"), str):
+            return result["plan"].strip()
+        return None

    def _generate_memory(
        self,
@@ -0,0 +1,18 @@
+You are the high-level planner for a robot demonstrating: "{episode_task}".
+
+Given the subtask decomposition below, write a concise hierarchical PLAN
+the robot should follow. Format the plan as a numbered list, one line per
+high-level step. The plan describes the full task; subtasks are the atomic
+skills used to execute it.
+
+Subtasks for context:
+{subtasks_text}
+
+Authoring rules:
+- 3 to {plan_max_steps} steps.
+- Each step describes one logical chunk of the task, not one motion.
+- Steps must be in execution order.
+- Plain prose, no JSON, no markdown headers.
+
+Output strictly valid JSON:
+  {{ "plan": "1. ...\n2. ...\n3. ..." }}
@@ -4,24 +4,17 @@ The user originally asked: "{episode_task}"

 You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
-the robot performs. Write **telegraphic** action labels.
+the robot performs.

-Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
+Authoring rules — based on Hi Robot (Shi 2025) atom granularity and
+Pi0.7 (Physical Intelligence 2025) "how, not what" detail:

- Each subtask = one atomic skill the low-level policy can execute.
- **Hard length cap: ≤ 4 words.** Ideally 2-3. Form: VERB + (color) +
-  OBJECT. No articles ("the", "a"), no destinations, no adverbs, no
-  "robot"/"arm"/"gripper" — those are implied.
- **Use the exact object nouns from the task above.** If the task says
-  "cube", every subtask says "cube" — never switch to "block". If it
-  says "box", never switch to "bin"/"container". Consistent vocabulary
-  across the whole episode.
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
-  "place blue cube", "open drawer", "release yellow cube".
- Bad: "release the yellow block into the green bin" (articles,
-  destination, "block" instead of "cube"), "the robot arm moves
-  towards the blue cube" ("the robot arm", too long), "carefully
-  pick up the cube" (adverb, article).
+- Each subtask is one atomic skill the low-level policy can execute,
+  e.g. "pick up one piece of lettuce", "place the bowl into the box",
+  "move the right arm to the left".
+- Capture HOW the subtask is performed, not only WHAT — e.g. prefer
+  "grasp the handle of the sponge with the left hand" to "pick up the
+  sponge".
 - Subtasks are non-overlapping and cover the full episode in order.
  Choose the cut points yourself based on what you see in the video
  (gripper open/close events, contact, regrasps, transitions).
@@ -34,7 +27,7 @@ Output strictly valid JSON of shape:

  {{
    "subtasks": [
-      {{"text": "<≤4-word verb phrase>", "start": <float>, "end": <float>}},
+      {{"text": "<how-not-what>", "start": <float>, "end": <float>}},
      ...
    ]
  }}
@@ -14,10 +14,12 @@ subtask boundary in the demonstration:
 - Subtask the robot is about to start: "{next_subtask}"
 - Time into episode: {timestamp:.2f}s

-Write ONE compact interjection the user would naturally say at this
-moment to prompt / confirm / encourage the robot to do "{next_subtask}".
-Keep it like a mid-task coaching cue, not a full instruction paragraph.
-Also write the robot's compact verbal acknowledgement.
+Write ONE interjection the user would naturally say at this moment to
+prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
+like a real human mid-task remark — conversational, varied, sometimes
+just a nudge, sometimes a clarification, sometimes a small constraint
+that the upcoming motion happens to satisfy. Plus the robot's verbal
+acknowledgement.

 Hard rules:

@@ -27,9 +29,7 @@ Hard rules:
  instead", DO NOT — those would contradict the demonstration.
 - The interjection must reference an object, location, or action that
  is plausible given the visible scene and the next subtask text.
- One short phrase or sentence each. Conversational, not robotic.
- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
- Keep robot speech very short: "OK.", "On it.", "Doing that."
+- One sentence each. Conversational, not robotic.

 Style examples (vary the phrasing — don't reuse these verbatim):
  - "Now go ahead and {next_subtask}."
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):

 Output strictly valid JSON:
  {{
-    "interjection": "<short cue from the user, asking for the next subtask>",
-    "speech":       "<short robot acknowledgement>"
+    "interjection": "<single sentence the user says, asking for the next subtask>",
+    "speech":       "<single sentence the robot speaks back, confirming and starting>"
  }}
@@ -141,43 +141,6 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
    )
    print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)

-    # Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
-    # resolves the dataset revision via ``get_safe_version`` which scans
-    # for tags like ``v3.0``; without a tag it raises
-    # ``RevisionNotFoundError``. Read the version straight from the
-    # dataset's own ``meta/info.json`` so we tag whatever the writer
-    # actually wrote (no accidental drift if the codebase floor moves).
-    from lerobot.datasets.dataset_metadata import CODEBASE_VERSION  # noqa: PLC0415
-
-    info_path = root / "meta" / "info.json"
-    version_tag = CODEBASE_VERSION
-    if info_path.exists():
-        try:
-            from lerobot.utils.io_utils import load_json  # noqa: PLC0415
-
-            info = load_json(info_path)
-            ds_version = info.get("codebase_version")
-            if isinstance(ds_version, str) and ds_version.startswith("v"):
-                version_tag = ds_version
-        except Exception as exc:  # noqa: BLE001
-            print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
-    try:
-        api.create_tag(
-            repo_id=repo_id,
-            tag=version_tag,
-            repo_type="dataset",
-            exist_ok=True,
-        )
-        print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
-    except Exception as exc:  # noqa: BLE001
-        print(
-            f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
-            "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
-            "Run: from huggingface_hub import HfApi; "
-            f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
-            flush=True,
-        )
-

 def main() -> None:
    annotate()