mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
revert(annotate): move pipeline changes to base PR (#3471)
The deterministic-plan rewrite, single-frame VQA (K 3->1), dataset version tagging, telegraphic-subtask prompt and shorter interjection prompt belong in the annotation pipeline itself, not in the SmolVLA training PR. They have been applied to feat/language-annotation-pipeline (#3471). Reverting these six files here to the merge-base so they drop out of this PR's diff; #3491 will inherit the canonical versions when it next rebases on its base. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -114,14 +114,7 @@ class Module3Config:
|
|||||||
|
|
||||||
enabled: bool = True
|
enabled: bool = True
|
||||||
vqa_emission_hz: float = 1.0
|
vqa_emission_hz: float = 1.0
|
||||||
K: int = 1
|
K: int = 3
|
||||||
"""How many *consecutive* frames each emission tick anchors a VQA pair
|
|
||||||
to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
|
|
||||||
against the *first* anchored frame's image, so anchoring K>1 frames
|
|
||||||
copies that same answer onto later frames where the scene has already
|
|
||||||
moved — stale labels. Default ``1``: a VQA pair lands on exactly its
|
|
||||||
emission frame, no temporal smear. Raise it only to trade label
|
|
||||||
precision for more (noisier) VQA frames."""
|
|
||||||
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
|
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -116,29 +116,18 @@ class PlanSubtasksMemoryModule:
|
|||||||
"tool_calls": None,
|
"tool_calls": None,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# Plan rows at every subtask boundary — including t=0 (start of
|
# plan row at t=0
|
||||||
# the first subtask). Because the plan is just a numbered list
|
plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
|
||||||
# of *still-todo* subtasks, re-emitting at each boundary makes
|
if plan_text is not None:
|
||||||
# the active plan shrink as work progresses: at frame t the
|
rows.append(
|
||||||
# rendered ``${plan}`` is the most recent emission, which
|
{
|
||||||
# contains exactly the subtasks that started at or after the
|
"role": "assistant",
|
||||||
# current span. Saves the runtime from having to derive
|
"content": plan_text,
|
||||||
# "what's still left" at inference time.
|
"style": "plan",
|
||||||
for span in subtask_spans:
|
"timestamp": float(t0),
|
||||||
boundary_t = _snap_to_frame(span["start"], record.frame_timestamps)
|
"tool_calls": None,
|
||||||
plan_text = self._generate_plan(
|
}
|
||||||
record, subtask_spans, refresh_t=boundary_t, task=effective_task
|
|
||||||
)
|
)
|
||||||
if plan_text is not None:
|
|
||||||
rows.append(
|
|
||||||
{
|
|
||||||
"role": "assistant",
|
|
||||||
"content": plan_text,
|
|
||||||
"style": "plan",
|
|
||||||
"timestamp": float(boundary_t),
|
|
||||||
"tool_calls": None,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
# memory rows at every subtask boundary except the very first start
|
# memory rows at every subtask boundary except the very first start
|
||||||
prior_memory = ""
|
prior_memory = ""
|
||||||
for i, span in enumerate(subtask_spans[1:], start=1):
|
for i, span in enumerate(subtask_spans[1:], start=1):
|
||||||
@@ -383,50 +372,54 @@ class PlanSubtasksMemoryModule:
|
|||||||
|
|
||||||
def _generate_plan(
|
def _generate_plan(
|
||||||
self,
|
self,
|
||||||
record: EpisodeRecord, # noqa: ARG002 (kept for signature stability)
|
record: EpisodeRecord,
|
||||||
subtask_spans: Sequence[dict[str, Any]],
|
subtask_spans: Sequence[dict[str, Any]],
|
||||||
*,
|
*,
|
||||||
refresh_t: float | None = None,
|
refresh_t: float | None = None,
|
||||||
interjection: str | None = None, # noqa: ARG002
|
interjection: str | None = None,
|
||||||
task: str | None = None, # noqa: ARG002
|
task: str | None = None,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
"""Deterministic plan = numbered list of *still-todo* subtasks.
|
|
||||||
|
|
||||||
Previously this called the VLM with a prompt that asked it to
|
|
||||||
compress the subtasks into a "compact hierarchical plan". That
|
|
||||||
produced longer-than-necessary plans, cost an extra VLM round-trip
|
|
||||||
per episode (plus one per interjection on refresh), and could
|
|
||||||
diverge from the actual subtask sequence the model is going to
|
|
||||||
execute. Replacing it with a plain summarisation keeps the plan
|
|
||||||
tightly aligned with the upcoming subtasks and removes the VLM
|
|
||||||
call entirely.
|
|
||||||
|
|
||||||
Layout (matches the v2 plan style — short imperative fragments
|
|
||||||
prefixed by "N. "):
|
|
||||||
|
|
||||||
1. <subtask 1>
|
|
||||||
2. <subtask 2>
|
|
||||||
...
|
|
||||||
|
|
||||||
On a refresh at ``refresh_t`` (called from ``run_plan_updates``
|
|
||||||
on interjection events), only subtasks whose start is at or
|
|
||||||
after ``refresh_t`` are included — the plan shrinks as work
|
|
||||||
progresses, so it always describes what's left.
|
|
||||||
"""
|
|
||||||
if not subtask_spans:
|
if not subtask_spans:
|
||||||
return None
|
return None
|
||||||
remaining = [
|
subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
|
||||||
s for s in subtask_spans
|
prompt = load_prompt("module_1_plan").format(
|
||||||
if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
|
episode_task=(task if task is not None else record.episode_task),
|
||||||
]
|
subtasks_text=subtasks_text,
|
||||||
if not remaining:
|
plan_max_steps=self.config.plan_max_steps,
|
||||||
# Past the last subtask boundary on a late refresh — nothing
|
|
||||||
# left to plan; emit None so the caller skips the row.
|
|
||||||
return None
|
|
||||||
return "\n".join(
|
|
||||||
f"{i}. {span.get('text', '').strip()}"
|
|
||||||
for i, span in enumerate(remaining, start=1)
|
|
||||||
)
|
)
|
||||||
|
if refresh_t is not None:
|
||||||
|
# ``current_subtask`` is the span the refresh time falls into,
|
||||||
|
# so the model knows where in the demonstration the planner is
|
||||||
|
# standing when it re-emits.
|
||||||
|
current_subtask = ""
|
||||||
|
for span in subtask_spans:
|
||||||
|
if float(span["start"]) <= refresh_t and (
|
||||||
|
"end" not in span or float(span["end"]) > refresh_t
|
||||||
|
):
|
||||||
|
current_subtask = span.get("text", "")
|
||||||
|
break
|
||||||
|
if interjection:
|
||||||
|
prompt += (
|
||||||
|
f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
|
||||||
|
f"interjection: {interjection!r}. Current subtask just "
|
||||||
|
f"before the interjection: {current_subtask!r}. Update "
|
||||||
|
f"the plan so it reflects the interjection — drop or "
|
||||||
|
f"reorder steps as needed; do not just restate.)\n"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Refresh without an interjection text: still tell the model
|
||||||
|
# where in the episode the plan stands so the re-emission
|
||||||
|
# is grounded. Should be rare — plan refreshes are
|
||||||
|
# interjection-driven by design.
|
||||||
|
prompt += (
|
||||||
|
f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current "
|
||||||
|
f"subtask: {current_subtask!r}.)\n"
|
||||||
|
)
|
||||||
|
messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
|
||||||
|
result = self.vlm.generate_json([messages])[0]
|
||||||
|
if isinstance(result, dict) and isinstance(result.get("plan"), str):
|
||||||
|
return result["plan"].strip()
|
||||||
|
return None
|
||||||
|
|
||||||
def _generate_memory(
|
def _generate_memory(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
You are the high-level planner for a robot demonstrating: "{episode_task}".
|
||||||
|
|
||||||
|
Given the subtask decomposition below, write a concise hierarchical PLAN
|
||||||
|
the robot should follow. Format the plan as a numbered list, one line per
|
||||||
|
high-level step. The plan describes the full task; subtasks are the atomic
|
||||||
|
skills used to execute it.
|
||||||
|
|
||||||
|
Subtasks for context:
|
||||||
|
{subtasks_text}
|
||||||
|
|
||||||
|
Authoring rules:
|
||||||
|
- 3 to {plan_max_steps} steps.
|
||||||
|
- Each step describes one logical chunk of the task, not one motion.
|
||||||
|
- Steps must be in execution order.
|
||||||
|
- Plain prose, no JSON, no markdown headers.
|
||||||
|
|
||||||
|
Output strictly valid JSON:
|
||||||
|
{{ "plan": "1. ...\n2. ...\n3. ..." }}
|
||||||
@@ -4,24 +4,17 @@ The user originally asked: "{episode_task}"
|
|||||||
|
|
||||||
You are shown the entire demonstration as a single video. Watch the
|
You are shown the entire demonstration as a single video. Watch the
|
||||||
whole clip, then segment it into a list of consecutive atomic subtasks
|
whole clip, then segment it into a list of consecutive atomic subtasks
|
||||||
the robot performs. Write **telegraphic** action labels.
|
the robot performs.
|
||||||
|
|
||||||
Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
|
Authoring rules — based on Hi Robot (Shi 2025) atom granularity and
|
||||||
|
Pi0.7 (Physical Intelligence 2025) "how, not what" detail:
|
||||||
|
|
||||||
- Each subtask = one atomic skill the low-level policy can execute.
|
- Each subtask is one atomic skill the low-level policy can execute,
|
||||||
- **Hard length cap: ≤ 4 words.** Ideally 2-3. Form: VERB + (color) +
|
e.g. "pick up one piece of lettuce", "place the bowl into the box",
|
||||||
OBJECT. No articles ("the", "a"), no destinations, no adverbs, no
|
"move the right arm to the left".
|
||||||
"robot"/"arm"/"gripper" — those are implied.
|
- Capture HOW the subtask is performed, not only WHAT — e.g. prefer
|
||||||
- **Use the exact object nouns from the task above.** If the task says
|
"grasp the handle of the sponge with the left hand" to "pick up the
|
||||||
"cube", every subtask says "cube" — never switch to "block". If it
|
sponge".
|
||||||
says "box", never switch to "bin"/"container". Consistent vocabulary
|
|
||||||
across the whole episode.
|
|
||||||
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
|
|
||||||
"place blue cube", "open drawer", "release yellow cube".
|
|
||||||
- Bad: "release the yellow block into the green bin" (articles,
|
|
||||||
destination, "block" instead of "cube"), "the robot arm moves
|
|
||||||
towards the blue cube" ("the robot arm", too long), "carefully
|
|
||||||
pick up the cube" (adverb, article).
|
|
||||||
- Subtasks are non-overlapping and cover the full episode in order.
|
- Subtasks are non-overlapping and cover the full episode in order.
|
||||||
Choose the cut points yourself based on what you see in the video
|
Choose the cut points yourself based on what you see in the video
|
||||||
(gripper open/close events, contact, regrasps, transitions).
|
(gripper open/close events, contact, regrasps, transitions).
|
||||||
@@ -34,7 +27,7 @@ Output strictly valid JSON of shape:
|
|||||||
|
|
||||||
{{
|
{{
|
||||||
"subtasks": [
|
"subtasks": [
|
||||||
{{"text": "<≤4-word verb phrase>", "start": <float>, "end": <float>}},
|
{{"text": "<how-not-what>", "start": <float>, "end": <float>}},
|
||||||
...
|
...
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
|
|||||||
@@ -14,10 +14,12 @@ subtask boundary in the demonstration:
|
|||||||
- Subtask the robot is about to start: "{next_subtask}"
|
- Subtask the robot is about to start: "{next_subtask}"
|
||||||
- Time into episode: {timestamp:.2f}s
|
- Time into episode: {timestamp:.2f}s
|
||||||
|
|
||||||
Write ONE compact interjection the user would naturally say at this
|
Write ONE interjection the user would naturally say at this moment to
|
||||||
moment to prompt / confirm / encourage the robot to do "{next_subtask}".
|
prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
|
||||||
Keep it like a mid-task coaching cue, not a full instruction paragraph.
|
like a real human mid-task remark — conversational, varied, sometimes
|
||||||
Also write the robot's compact verbal acknowledgement.
|
just a nudge, sometimes a clarification, sometimes a small constraint
|
||||||
|
that the upcoming motion happens to satisfy. Plus the robot's verbal
|
||||||
|
acknowledgement.
|
||||||
|
|
||||||
Hard rules:
|
Hard rules:
|
||||||
|
|
||||||
@@ -27,9 +29,7 @@ Hard rules:
|
|||||||
instead", DO NOT — those would contradict the demonstration.
|
instead", DO NOT — those would contradict the demonstration.
|
||||||
- The interjection must reference an object, location, or action that
|
- The interjection must reference an object, location, or action that
|
||||||
is plausible given the visible scene and the next subtask text.
|
is plausible given the visible scene and the next subtask text.
|
||||||
- One short phrase or sentence each. Conversational, not robotic.
|
- One sentence each. Conversational, not robotic.
|
||||||
- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
|
|
||||||
- Keep robot speech very short: "OK.", "On it.", "Doing that."
|
|
||||||
|
|
||||||
Style examples (vary the phrasing — don't reuse these verbatim):
|
Style examples (vary the phrasing — don't reuse these verbatim):
|
||||||
- "Now go ahead and {next_subtask}."
|
- "Now go ahead and {next_subtask}."
|
||||||
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):
|
|||||||
|
|
||||||
Output strictly valid JSON:
|
Output strictly valid JSON:
|
||||||
{{
|
{{
|
||||||
"interjection": "<short cue from the user, asking for the next subtask>",
|
"interjection": "<single sentence the user says, asking for the next subtask>",
|
||||||
"speech": "<short robot acknowledgement>"
|
"speech": "<single sentence the robot speaks back, confirming and starting>"
|
||||||
}}
|
}}
|
||||||
|
|||||||
@@ -141,43 +141,6 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
|
|||||||
)
|
)
|
||||||
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
|
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
|
||||||
|
|
||||||
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
|
|
||||||
# resolves the dataset revision via ``get_safe_version`` which scans
|
|
||||||
# for tags like ``v3.0``; without a tag it raises
|
|
||||||
# ``RevisionNotFoundError``. Read the version straight from the
|
|
||||||
# dataset's own ``meta/info.json`` so we tag whatever the writer
|
|
||||||
# actually wrote (no accidental drift if the codebase floor moves).
|
|
||||||
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
|
|
||||||
|
|
||||||
info_path = root / "meta" / "info.json"
|
|
||||||
version_tag = CODEBASE_VERSION
|
|
||||||
if info_path.exists():
|
|
||||||
try:
|
|
||||||
from lerobot.utils.io_utils import load_json # noqa: PLC0415
|
|
||||||
|
|
||||||
info = load_json(info_path)
|
|
||||||
ds_version = info.get("codebase_version")
|
|
||||||
if isinstance(ds_version, str) and ds_version.startswith("v"):
|
|
||||||
version_tag = ds_version
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
|
|
||||||
try:
|
|
||||||
api.create_tag(
|
|
||||||
repo_id=repo_id,
|
|
||||||
tag=version_tag,
|
|
||||||
repo_type="dataset",
|
|
||||||
exist_ok=True,
|
|
||||||
)
|
|
||||||
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
print(
|
|
||||||
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
|
|
||||||
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
|
|
||||||
"Run: from huggingface_hub import HfApi; "
|
|
||||||
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
annotate()
|
annotate()
|
||||||
|
|||||||
Reference in New Issue
Block a user