revert(annotate): move pipeline changes to base PR (#3471)

The deterministic-plan rewrite, single-frame VQA (K 3->1), dataset version tagging, telegraphic-subtask prompt and shorter interjection prompt belong in the annotation pipeline itself, not in the SmolVLA training PR. They have been applied to feat/language-annotation-pipeline (#3471). Reverting these six files here to the merge-base so they drop out of this PR's diff; #3491 will inherit the canonical versions when it next rebases on its base.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 02:36:11 +00:00 · 2026-05-19 14:07:23 +02:00
parent bb31988915
commit 182f10184f
6 changed files with 90 additions and 130 deletions
@@ -114,14 +114,7 @@ class Module3Config:
    enabled: bool = True
    vqa_emission_hz: float = 1.0
-    K: int = 1
+    K: int = 3
-    """How many *consecutive* frames each emission tick anchors a VQA pair
-    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
-    against the *first* anchored frame's image, so anchoring K>1 frames
-    copies that same answer onto later frames where the scene has already
-    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
-    emission frame, no temporal smear. Raise it only to trade label
-    precision for more (noisier) VQA frames."""
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
@@ -116,29 +116,18 @@ class PlanSubtasksMemoryModule:
                    "tool_calls": None,
                }
            )
-        # Plan rows at every subtask boundary — including t=0 (start of
+        # plan row at t=0
-        # the first subtask). Because the plan is just a numbered list
+        plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
-        # of *still-todo* subtasks, re-emitting at each boundary makes
+        if plan_text is not None:
-        # the active plan shrink as work progresses: at frame t the
+            rows.append(
-        # rendered ``${plan}`` is the most recent emission, which
+                {
-        # contains exactly the subtasks that started at or after the
+                    "role": "assistant",
-        # current span. Saves the runtime from having to derive
+                    "content": plan_text,
-        # "what's still left" at inference time.
+                    "style": "plan",
-        for span in subtask_spans:
+                    "timestamp": float(t0),
-            boundary_t = _snap_to_frame(span["start"], record.frame_timestamps)
+                    "tool_calls": None,
-            plan_text = self._generate_plan(
+                }
                record, subtask_spans, refresh_t=boundary_t, task=effective_task
            )
            if plan_text is not None:
                rows.append(
                    {
                        "role": "assistant",
                        "content": plan_text,
                        "style": "plan",
                        "timestamp": float(boundary_t),
                        "tool_calls": None,
                    }
                )
        # memory rows at every subtask boundary except the very first start
        prior_memory = ""
        for i, span in enumerate(subtask_spans[1:], start=1):
@@ -383,50 +372,54 @@ class PlanSubtasksMemoryModule:
    def _generate_plan(
        self,
-        record: EpisodeRecord,  # noqa: ARG002  (kept for signature stability)
+        record: EpisodeRecord,
        subtask_spans: Sequence[dict[str, Any]],
        *,
        refresh_t: float | None = None,
-        interjection: str | None = None,  # noqa: ARG002
+        interjection: str | None = None,
-        task: str | None = None,  # noqa: ARG002
+        task: str | None = None,
    ) -> str | None:
        """Deterministic plan = numbered list of *still-todo* subtasks.
        Previously this called the VLM with a prompt that asked it to
        compress the subtasks into a "compact hierarchical plan". That
        produced longer-than-necessary plans, cost an extra VLM round-trip
        per episode (plus one per interjection on refresh), and could
        diverge from the actual subtask sequence the model is going to
        execute. Replacing it with a plain summarisation keeps the plan
        tightly aligned with the upcoming subtasks and removes the VLM
        call entirely.
        Layout (matches the v2 plan style — short imperative fragments
        prefixed by "N. "):
            1. <subtask 1>
            2. <subtask 2>
            ...
        On a refresh at ``refresh_t`` (called from ``run_plan_updates``
        on interjection events), only subtasks whose start is at or
        after ``refresh_t`` are included — the plan shrinks as work
        progresses, so it always describes what's left.
        """
        if not subtask_spans:
            return None
-        remaining = [
+        subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
-            s for s in subtask_spans
+        prompt = load_prompt("module_1_plan").format(
-            if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
+            episode_task=(task if task is not None else record.episode_task),
-        ]
+            subtasks_text=subtasks_text,
-        if not remaining:
+            plan_max_steps=self.config.plan_max_steps,
            # Past the last subtask boundary on a late refresh — nothing
            # left to plan; emit None so the caller skips the row.
            return None
        return "\n".join(
            f"{i}. {span.get('text', '').strip()}"
            for i, span in enumerate(remaining, start=1)
        )
        if refresh_t is not None:
            # ``current_subtask`` is the span the refresh time falls into,
            # so the model knows where in the demonstration the planner is
            # standing when it re-emits.
            current_subtask = ""
            for span in subtask_spans:
                if float(span["start"]) <= refresh_t and (
                    "end" not in span or float(span["end"]) > refresh_t
                ):
                    current_subtask = span.get("text", "")
                    break
            if interjection:
                prompt += (
                    f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
                    f"interjection: {interjection!r}. Current subtask just "
                    f"before the interjection: {current_subtask!r}. Update "
                    f"the plan so it reflects the interjection — drop or "
                    f"reorder steps as needed; do not just restate.)\n"
                )
            else:
                # Refresh without an interjection text: still tell the model
                # where in the episode the plan stands so the re-emission
                # is grounded. Should be rare — plan refreshes are
                # interjection-driven by design.
                prompt += (
                    f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current "
                    f"subtask: {current_subtask!r}.)\n"
                )
        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
        result = self.vlm.generate_json([messages])[0]
        if isinstance(result, dict) and isinstance(result.get("plan"), str):
            return result["plan"].strip()
        return None
    def _generate_memory(
        self,
@@ -0,0 +1,18 @@
 You are the high-level planner for a robot demonstrating: "{episode_task}".
 Given the subtask decomposition below, write a concise hierarchical PLAN
 the robot should follow. Format the plan as a numbered list, one line per
 high-level step. The plan describes the full task; subtasks are the atomic
 skills used to execute it.
 Subtasks for context:
 {subtasks_text}
 Authoring rules:
 - 3 to {plan_max_steps} steps.
 - Each step describes one logical chunk of the task, not one motion.
 - Steps must be in execution order.
 - Plain prose, no JSON, no markdown headers.
 Output strictly valid JSON:
  {{ "plan": "1. ...\n2. ...\n3. ..." }}
@@ -4,24 +4,17 @@ The user originally asked: "{episode_task}"
 You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
-the robot performs. Write **telegraphic** action labels.
+the robot performs.
-Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
+Authoring rules — based on Hi Robot (Shi 2025) atom granularity and
 Pi0.7 (Physical Intelligence 2025) "how, not what" detail:
- Each subtask = one atomic skill the low-level policy can execute.
+- Each subtask is one atomic skill the low-level policy can execute,
- **Hard length cap: ≤ 4 words.** Ideally 2-3. Form: VERB + (color) +
+  e.g. "pick up one piece of lettuce", "place the bowl into the box",
-  OBJECT. No articles ("the", "a"), no destinations, no adverbs, no
+  "move the right arm to the left".
-  "robot"/"arm"/"gripper" — those are implied.
+- Capture HOW the subtask is performed, not only WHAT — e.g. prefer
- **Use the exact object nouns from the task above.** If the task says
+  "grasp the handle of the sponge with the left hand" to "pick up the
-  "cube", every subtask says "cube" — never switch to "block". If it
+  sponge".
  says "box", never switch to "bin"/"container". Consistent vocabulary
  across the whole episode.
 - Good: "move to blue cube", "grasp blue cube", "lift blue cube",
  "place blue cube", "open drawer", "release yellow cube".
 - Bad: "release the yellow block into the green bin" (articles,
  destination, "block" instead of "cube"), "the robot arm moves
  towards the blue cube" ("the robot arm", too long), "carefully
  pick up the cube" (adverb, article).
 - Subtasks are non-overlapping and cover the full episode in order.
  Choose the cut points yourself based on what you see in the video
  (gripper open/close events, contact, regrasps, transitions).
@@ -34,7 +27,7 @@ Output strictly valid JSON of shape:
  {{
    "subtasks": [
-      {{"text": "<≤4-word verb phrase>", "start": <float>, "end": <float>}},
+      {{"text": "<how-not-what>", "start": <float>, "end": <float>}},
      ...
    ]
  }}
@@ -14,10 +14,12 @@ subtask boundary in the demonstration:
 - Subtask the robot is about to start: "{next_subtask}"
 - Time into episode: {timestamp:.2f}s
-Write ONE compact interjection the user would naturally say at this
+Write ONE interjection the user would naturally say at this moment to
-moment to prompt / confirm / encourage the robot to do "{next_subtask}".
+prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
-Keep it like a mid-task coaching cue, not a full instruction paragraph.
+like a real human mid-task remark — conversational, varied, sometimes
-Also write the robot's compact verbal acknowledgement.
+just a nudge, sometimes a clarification, sometimes a small constraint
 that the upcoming motion happens to satisfy. Plus the robot's verbal
 acknowledgement.
 Hard rules:
@@ -27,9 +29,7 @@ Hard rules:
  instead", DO NOT — those would contradict the demonstration.
 - The interjection must reference an object, location, or action that
  is plausible given the visible scene and the next subtask text.
- One short phrase or sentence each. Conversational, not robotic.
+- One sentence each. Conversational, not robotic.
 - Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
 - Keep robot speech very short: "OK.", "On it.", "Doing that."
 Style examples (vary the phrasing — don't reuse these verbatim):
  - "Now go ahead and {next_subtask}."
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):
 Output strictly valid JSON:
  {{
-    "interjection": "<short cue from the user, asking for the next subtask>",
+    "interjection": "<single sentence the user says, asking for the next subtask>",
-    "speech":       "<short robot acknowledgement>"
+    "speech":       "<single sentence the robot speaks back, confirming and starting>"
  }}
@@ -141,43 +141,6 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
    )
    print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
-    # Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
-    # resolves the dataset revision via ``get_safe_version`` which scans
-    # for tags like ``v3.0``; without a tag it raises
-    # ``RevisionNotFoundError``. Read the version straight from the
-    # dataset's own ``meta/info.json`` so we tag whatever the writer
-    # actually wrote (no accidental drift if the codebase floor moves).
-    from lerobot.datasets.dataset_metadata import CODEBASE_VERSION  # noqa: PLC0415
-    info_path = root / "meta" / "info.json"
-    version_tag = CODEBASE_VERSION
-    if info_path.exists():
-        try:
-            from lerobot.utils.io_utils import load_json  # noqa: PLC0415
-            info = load_json(info_path)
-            ds_version = info.get("codebase_version")
-            if isinstance(ds_version, str) and ds_version.startswith("v"):
-                version_tag = ds_version
-        except Exception as exc:  # noqa: BLE001
-            print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
-    try:
-        api.create_tag(
-            repo_id=repo_id,
-            tag=version_tag,
-            repo_type="dataset",
-            exist_ok=True,
-        )
-        print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
-    except Exception as exc:  # noqa: BLE001
-        print(
-            f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
-            "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
-            "Run: from huggingface_hub import HfApi; "
-            f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
-            flush=True,
-        )
 def main() -> None:
    annotate()