revert(annotate): move pipeline changes to base PR (#3471)

The deterministic-plan rewrite, single-frame VQA (K 3->1), dataset
version tagging, telegraphic-subtask prompt and shorter interjection
prompt belong in the annotation pipeline itself, not in the SmolVLA
training PR. They have been applied to feat/language-annotation-
pipeline (#3471). Reverting these six files here to the merge-base so
they drop out of this PR's diff; #3491 will inherit the canonical
versions when it next rebases on its base.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-19 14:07:23 +02:00
parent bb31988915
commit 182f10184f
6 changed files with 90 additions and 130 deletions
@@ -114,14 +114,7 @@ class Module3Config:
enabled: bool = True
vqa_emission_hz: float = 1.0
K: int = 1
"""How many *consecutive* frames each emission tick anchors a VQA pair
to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
against the *first* anchored frame's image, so anchoring K>1 frames
copies that same answer onto later frames where the scene has already
moved — stale labels. Default ``1``: a VQA pair lands on exactly its
emission frame, no temporal smear. Raise it only to trade label
precision for more (noisier) VQA frames."""
K: int = 3
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
@@ -116,29 +116,18 @@ class PlanSubtasksMemoryModule:
"tool_calls": None,
}
)
# Plan rows at every subtask boundary — including t=0 (start of
# the first subtask). Because the plan is just a numbered list
# of *still-todo* subtasks, re-emitting at each boundary makes
# the active plan shrink as work progresses: at frame t the
# rendered ``${plan}`` is the most recent emission, which
# contains exactly the subtasks that started at or after the
# current span. Saves the runtime from having to derive
# "what's still left" at inference time.
for span in subtask_spans:
boundary_t = _snap_to_frame(span["start"], record.frame_timestamps)
plan_text = self._generate_plan(
record, subtask_spans, refresh_t=boundary_t, task=effective_task
# plan row at t=0
plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
if plan_text is not None:
rows.append(
{
"role": "assistant",
"content": plan_text,
"style": "plan",
"timestamp": float(t0),
"tool_calls": None,
}
)
if plan_text is not None:
rows.append(
{
"role": "assistant",
"content": plan_text,
"style": "plan",
"timestamp": float(boundary_t),
"tool_calls": None,
}
)
# memory rows at every subtask boundary except the very first start
prior_memory = ""
for i, span in enumerate(subtask_spans[1:], start=1):
@@ -383,50 +372,54 @@ class PlanSubtasksMemoryModule:
def _generate_plan(
self,
record: EpisodeRecord, # noqa: ARG002 (kept for signature stability)
record: EpisodeRecord,
subtask_spans: Sequence[dict[str, Any]],
*,
refresh_t: float | None = None,
interjection: str | None = None, # noqa: ARG002
task: str | None = None, # noqa: ARG002
interjection: str | None = None,
task: str | None = None,
) -> str | None:
"""Deterministic plan = numbered list of *still-todo* subtasks.
Previously this called the VLM with a prompt that asked it to
compress the subtasks into a "compact hierarchical plan". That
produced longer-than-necessary plans, cost an extra VLM round-trip
per episode (plus one per interjection on refresh), and could
diverge from the actual subtask sequence the model is going to
execute. Replacing it with a plain summarisation keeps the plan
tightly aligned with the upcoming subtasks and removes the VLM
call entirely.
Layout (matches the v2 plan style — short imperative fragments
prefixed by "N. "):
1. <subtask 1>
2. <subtask 2>
...
On a refresh at ``refresh_t`` (called from ``run_plan_updates``
on interjection events), only subtasks whose start is at or
after ``refresh_t`` are included — the plan shrinks as work
progresses, so it always describes what's left.
"""
if not subtask_spans:
return None
remaining = [
s for s in subtask_spans
if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
]
if not remaining:
# Past the last subtask boundary on a late refresh — nothing
# left to plan; emit None so the caller skips the row.
return None
return "\n".join(
f"{i}. {span.get('text', '').strip()}"
for i, span in enumerate(remaining, start=1)
subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
prompt = load_prompt("module_1_plan").format(
episode_task=(task if task is not None else record.episode_task),
subtasks_text=subtasks_text,
plan_max_steps=self.config.plan_max_steps,
)
if refresh_t is not None:
# ``current_subtask`` is the span the refresh time falls into,
# so the model knows where in the demonstration the planner is
# standing when it re-emits.
current_subtask = ""
for span in subtask_spans:
if float(span["start"]) <= refresh_t and (
"end" not in span or float(span["end"]) > refresh_t
):
current_subtask = span.get("text", "")
break
if interjection:
prompt += (
f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
f"interjection: {interjection!r}. Current subtask just "
f"before the interjection: {current_subtask!r}. Update "
f"the plan so it reflects the interjection — drop or "
f"reorder steps as needed; do not just restate.)\n"
)
else:
# Refresh without an interjection text: still tell the model
# where in the episode the plan stands so the re-emission
# is grounded. Should be rare — plan refreshes are
# interjection-driven by design.
prompt += (
f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current "
f"subtask: {current_subtask!r}.)\n"
)
messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
result = self.vlm.generate_json([messages])[0]
if isinstance(result, dict) and isinstance(result.get("plan"), str):
return result["plan"].strip()
return None
def _generate_memory(
self,
@@ -0,0 +1,18 @@
You are the high-level planner for a robot demonstrating: "{episode_task}".
Given the subtask decomposition below, write a concise hierarchical PLAN
the robot should follow. Format the plan as a numbered list, one line per
high-level step. The plan describes the full task; subtasks are the atomic
skills used to execute it.
Subtasks for context:
{subtasks_text}
Authoring rules:
- 3 to {plan_max_steps} steps.
- Each step describes one logical chunk of the task, not one motion.
- Steps must be in execution order.
- Plain prose, no JSON, no markdown headers.
Output strictly valid JSON:
{{ "plan": "1. ...\n2. ...\n3. ..." }}
@@ -4,24 +4,17 @@ The user originally asked: "{episode_task}"
You are shown the entire demonstration as a single video. Watch the
whole clip, then segment it into a list of consecutive atomic subtasks
the robot performs. Write **telegraphic** action labels.
the robot performs.
Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
Authoring rules — based on Hi Robot (Shi 2025) atom granularity and
Pi0.7 (Physical Intelligence 2025) "how, not what" detail:
- Each subtask = one atomic skill the low-level policy can execute.
- **Hard length cap: ≤ 4 words.** Ideally 2-3. Form: VERB + (color) +
OBJECT. No articles ("the", "a"), no destinations, no adverbs, no
"robot"/"arm"/"gripper" — those are implied.
- **Use the exact object nouns from the task above.** If the task says
"cube", every subtask says "cube" — never switch to "block". If it
says "box", never switch to "bin"/"container". Consistent vocabulary
across the whole episode.
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
"place blue cube", "open drawer", "release yellow cube".
- Bad: "release the yellow block into the green bin" (articles,
destination, "block" instead of "cube"), "the robot arm moves
towards the blue cube" ("the robot arm", too long), "carefully
pick up the cube" (adverb, article).
- Each subtask is one atomic skill the low-level policy can execute,
e.g. "pick up one piece of lettuce", "place the bowl into the box",
"move the right arm to the left".
- Capture HOW the subtask is performed, not only WHAT — e.g. prefer
"grasp the handle of the sponge with the left hand" to "pick up the
sponge".
- Subtasks are non-overlapping and cover the full episode in order.
Choose the cut points yourself based on what you see in the video
(gripper open/close events, contact, regrasps, transitions).
@@ -34,7 +27,7 @@ Output strictly valid JSON of shape:
{{
"subtasks": [
{{"text": "<≤4-word verb phrase>", "start": <float>, "end": <float>}},
{{"text": "<how-not-what>", "start": <float>, "end": <float>}},
...
]
}}
@@ -14,10 +14,12 @@ subtask boundary in the demonstration:
- Subtask the robot is about to start: "{next_subtask}"
- Time into episode: {timestamp:.2f}s
Write ONE compact interjection the user would naturally say at this
moment to prompt / confirm / encourage the robot to do "{next_subtask}".
Keep it like a mid-task coaching cue, not a full instruction paragraph.
Also write the robot's compact verbal acknowledgement.
Write ONE interjection the user would naturally say at this moment to
prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
like a real human mid-task remark — conversational, varied, sometimes
just a nudge, sometimes a clarification, sometimes a small constraint
that the upcoming motion happens to satisfy. Plus the robot's verbal
acknowledgement.
Hard rules:
@@ -27,9 +29,7 @@ Hard rules:
instead", DO NOT — those would contradict the demonstration.
- The interjection must reference an object, location, or action that
is plausible given the visible scene and the next subtask text.
- One short phrase or sentence each. Conversational, not robotic.
- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
- Keep robot speech very short: "OK.", "On it.", "Doing that."
- One sentence each. Conversational, not robotic.
Style examples (vary the phrasing — don't reuse these verbatim):
- "Now go ahead and {next_subtask}."
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):
Output strictly valid JSON:
{{
"interjection": "<short cue from the user, asking for the next subtask>",
"speech": "<short robot acknowledgement>"
"interjection": "<single sentence the user says, asking for the next subtask>",
"speech": "<single sentence the robot speaks back, confirming and starting>"
}}
-37
View File
@@ -141,43 +141,6 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
)
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
# resolves the dataset revision via ``get_safe_version`` which scans
# for tags like ``v3.0``; without a tag it raises
# ``RevisionNotFoundError``. Read the version straight from the
# dataset's own ``meta/info.json`` so we tag whatever the writer
# actually wrote (no accidental drift if the codebase floor moves).
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
info_path = root / "meta" / "info.json"
version_tag = CODEBASE_VERSION
if info_path.exists():
try:
from lerobot.utils.io_utils import load_json # noqa: PLC0415
info = load_json(info_path)
ds_version = info.get("codebase_version")
if isinstance(ds_version, str) and ds_version.startswith("v"):
version_tag = ds_version
except Exception as exc: # noqa: BLE001
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
try:
api.create_tag(
repo_id=repo_id,
tag=version_tag,
repo_type="dataset",
exist_ok=True,
)
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
except Exception as exc: # noqa: BLE001
print(
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
"Run: from huggingface_hub import HfApi; "
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
flush=True,
)
def main() -> None:
annotate()