mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-25 05:29:55 +00:00
feat(annotate): deterministic plan, single-frame VQA, dataset tagging
Port the steerable-pipeline refinements developed on feat/smolvla-on-steerable back into the annotation pipeline itself:
- module_1_subtasks: imperative verb-first telegraphic labels with a consistent-object-noun rule and good/bad examples (no hard word cap).
- _generate_plan: drop the VLM round-trip; the plan is now a deterministic numbered list of still-todo subtasks, re-emitted at every subtask boundary so it shrinks as work progresses. Removes module_1_plan.txt.
- VqaConfig.K 3 -> 1: a VQA pair anchors exactly its emission frame, no stale-label temporal smear.
- lerobot-annotate: tag the pushed dataset with its codebase_version so LeRobotDataset can resolve a revision and load it.
- module_2_interjection: shorter, more natural mid-task cues.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -83,7 +83,14 @@ class VqaConfig:
|
|||||||
|
|
||||||
enabled: bool = True
|
enabled: bool = True
|
||||||
vqa_emission_hz: float = 1.0
|
vqa_emission_hz: float = 1.0
|
||||||
K: int = 3
|
K: int = 1
|
||||||
|
"""How many *consecutive* frames each emission tick anchors a VQA pair
|
||||||
|
to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
|
||||||
|
against the *first* anchored frame's image, so anchoring K>1 frames
|
||||||
|
copies that same answer onto later frames where the scene has already
|
||||||
|
moved — stale labels. Default ``1``: a VQA pair lands on exactly its
|
||||||
|
emission frame, no temporal smear. Raise it only to trade label
|
||||||
|
precision for more (noisier) VQA frames."""
|
||||||
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
|
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -104,18 +104,29 @@ class PlanSubtasksMemoryModule:
|
|||||||
"tool_calls": None,
|
"tool_calls": None,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# plan row at t=0
|
# Plan rows at every subtask boundary — including t=0 (start of
|
||||||
plan_text = self._generate_plan(record, subtask_spans, task=effective_task)
|
# the first subtask). Because the plan is just a numbered list
|
||||||
if plan_text is not None:
|
# of *still-todo* subtasks, re-emitting at each boundary makes
|
||||||
rows.append(
|
# the active plan shrink as work progresses: at frame t the
|
||||||
{
|
# rendered ``${plan}`` is the most recent emission, which
|
||||||
"role": "assistant",
|
# contains exactly the subtasks that started at or after the
|
||||||
"content": plan_text,
|
# current span. Saves the runtime from having to derive
|
||||||
"style": "plan",
|
# "what's still left" at inference time.
|
||||||
"timestamp": float(t0),
|
for span in subtask_spans:
|
||||||
"tool_calls": None,
|
boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
|
||||||
}
|
plan_text = self._generate_plan(
|
||||||
|
record, subtask_spans, refresh_t=boundary_t, task=effective_task
|
||||||
)
|
)
|
||||||
|
if plan_text is not None:
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": plan_text,
|
||||||
|
"style": "plan",
|
||||||
|
"timestamp": float(boundary_t),
|
||||||
|
"tool_calls": None,
|
||||||
|
}
|
||||||
|
)
|
||||||
# memory rows at every subtask boundary except the very first start
|
# memory rows at every subtask boundary except the very first start
|
||||||
prior_memory = ""
|
prior_memory = ""
|
||||||
for i, span in enumerate(subtask_spans[1:], start=1):
|
for i, span in enumerate(subtask_spans[1:], start=1):
|
||||||
@@ -327,48 +338,50 @@ class PlanSubtasksMemoryModule:
|
|||||||
|
|
||||||
def _generate_plan(
|
def _generate_plan(
|
||||||
self,
|
self,
|
||||||
record: EpisodeRecord,
|
record: EpisodeRecord, # noqa: ARG002 (kept for signature stability)
|
||||||
subtask_spans: Sequence[dict[str, Any]],
|
subtask_spans: Sequence[dict[str, Any]],
|
||||||
*,
|
*,
|
||||||
refresh_t: float | None = None,
|
refresh_t: float | None = None,
|
||||||
interjection: str | None = None,
|
interjection: str | None = None, # noqa: ARG002
|
||||||
task: str | None = None,
|
task: str | None = None, # noqa: ARG002
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
|
"""Deterministic plan = numbered list of *still-todo* subtasks.
|
||||||
|
|
||||||
|
Previously this called the VLM with a prompt that asked it to
|
||||||
|
compress the subtasks into a "compact hierarchical plan". That
|
||||||
|
produced longer-than-necessary plans, cost an extra VLM round-trip
|
||||||
|
per episode (plus one per interjection on refresh), and could
|
||||||
|
diverge from the actual subtask sequence the model is going to
|
||||||
|
execute. Replacing it with a plain enumeration keeps the plan
|
||||||
|
tightly aligned with the upcoming subtasks and removes the VLM
|
||||||
|
call entirely.
|
||||||
|
|
||||||
|
Layout — short imperative fragments prefixed by "N. ":
|
||||||
|
|
||||||
|
1. <subtask 1>
|
||||||
|
2. <subtask 2>
|
||||||
|
...
|
||||||
|
|
||||||
|
On a refresh at ``refresh_t`` (called from ``run_plan_updates``
|
||||||
|
on interjection events, and from ``run_episode`` at every subtask
|
||||||
|
boundary), only subtasks whose start is at or after ``refresh_t``
|
||||||
|
are included — the plan shrinks as work progresses, so it always
|
||||||
|
describes what's left.
|
||||||
|
"""
|
||||||
if not subtask_spans:
|
if not subtask_spans:
|
||||||
return None
|
return None
|
||||||
subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans)
|
remaining = [
|
||||||
prompt = load_prompt("module_1_plan").format(
|
s
|
||||||
episode_task=(task if task is not None else record.episode_task),
|
for s in subtask_spans
|
||||||
subtasks_text=subtasks_text,
|
if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
|
||||||
plan_max_steps=self.config.plan_max_steps,
|
]
|
||||||
|
if not remaining:
|
||||||
|
# Past the last subtask boundary on a late refresh — nothing
|
||||||
|
# left to plan; emit None so the caller skips the row.
|
||||||
|
return None
|
||||||
|
return "\n".join(
|
||||||
|
f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1)
|
||||||
)
|
)
|
||||||
if refresh_t is not None:
|
|
||||||
# ``current_subtask`` is the span the refresh time falls into,
|
|
||||||
# so the model knows where in the demonstration the planner is
|
|
||||||
# standing when it re-emits.
|
|
||||||
current_subtask = ""
|
|
||||||
for span in subtask_spans:
|
|
||||||
if float(span["start"]) <= refresh_t and (
|
|
||||||
"end" not in span or float(span["end"]) > refresh_t
|
|
||||||
):
|
|
||||||
current_subtask = span.get("text", "")
|
|
||||||
break
|
|
||||||
if interjection:
|
|
||||||
prompt += (
|
|
||||||
f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user "
|
|
||||||
f"interjection: {interjection!r}. Current subtask just "
|
|
||||||
f"before the interjection: {current_subtask!r}. Update "
|
|
||||||
f"the plan so it reflects the interjection — drop or "
|
|
||||||
f"reorder steps as needed; do not just restate.)\n"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Refresh without an interjection text: still tell the model
|
|
||||||
# where in the episode the plan stands so the re-emission
|
|
||||||
# is grounded. Should be rare — plan refreshes are
|
|
||||||
# interjection-driven by design.
|
|
||||||
prompt += f"\n\n(Plan refresh at t={refresh_t:.2f}s. Current subtask: {current_subtask!r}.)\n"
|
|
||||||
plan = self._vlm_field(self._text_message(prompt), "plan")
|
|
||||||
return plan.strip() if isinstance(plan, str) else None
|
|
||||||
|
|
||||||
def _generate_memory(
|
def _generate_memory(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -1,18 +0,0 @@
|
|||||||
You are the high-level planner for a robot demonstrating: "{episode_task}".
|
|
||||||
|
|
||||||
Given the subtask decomposition below, write a concise hierarchical PLAN
|
|
||||||
the robot should follow. Format the plan as a numbered list, one line per
|
|
||||||
high-level step. The plan describes the full task; subtasks are the atomic
|
|
||||||
skills used to execute it.
|
|
||||||
|
|
||||||
Subtasks for context:
|
|
||||||
{subtasks_text}
|
|
||||||
|
|
||||||
Authoring rules:
|
|
||||||
- 3 to {plan_max_steps} steps.
|
|
||||||
- Each step describes one logical chunk of the task, not one motion.
|
|
||||||
- Steps must be in execution order.
|
|
||||||
- Plain prose, no JSON, no markdown headers.
|
|
||||||
|
|
||||||
Output strictly valid JSON:
|
|
||||||
{{ "plan": "1. ...\n2. ...\n3. ..." }}
|
|
||||||
@@ -4,20 +4,30 @@ The user originally asked: "{episode_task}"
|
|||||||
|
|
||||||
You are shown the entire demonstration as a single video. Watch the
|
You are shown the entire demonstration as a single video. Watch the
|
||||||
whole clip, then segment it into a list of consecutive atomic subtasks
|
whole clip, then segment it into a list of consecutive atomic subtasks
|
||||||
the robot performs.
|
the robot performs. Write short, telegraphic action labels.
|
||||||
|
|
||||||
Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
|
Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
|
||||||
|
|
||||||
- Each subtask is one atomic skill the low-level policy can execute,
|
- Each subtask = one atomic skill the low-level policy can execute.
|
||||||
e.g. "pick up the orange", "place the bowl into the box".
|
- Write each subtask as an IMPERATIVE COMMAND, starting with a verb:
|
||||||
- Write each subtask as an IMPERATIVE COMMAND to the robot, starting
|
move, reach, pick up, grasp, place, put, push, pull, open, close,
|
||||||
with a verb: move, reach, pick up, grasp, place, put, push, pull,
|
turn, press, lift, insert, pour...
|
||||||
open, close, turn, press, lift, insert, pour...
|
- Keep it SHORT — a verb phrase, not a sentence. Drop articles
|
||||||
|
("the", "a") and adverbs ("carefully", "slowly"). Add a "how"
|
||||||
|
detail (which hand, which grasp point) ONLY when it is needed to
|
||||||
|
disambiguate.
|
||||||
- NEVER use third person. Never write "the robot", "the arm", "the
|
- NEVER use third person. Never write "the robot", "the arm", "the
|
||||||
gripper moves", "it picks up". Command the robot, do not describe it.
|
gripper moves", "it picks up" — the robot is implied. Command it,
|
||||||
- Keep it SHORT — 3 to 8 words. Add a "how" detail (which hand, which
|
do not describe it.
|
||||||
grasp point) ONLY when it is needed to disambiguate.
|
- Use the exact object nouns from the task above. If the task says
|
||||||
- Lower-case, no trailing period.
|
"cube", every subtask says "cube" — never switch to "block". If it
|
||||||
|
says "box", never switch to "bin"/"container". Keep vocabulary
|
||||||
|
consistent across the whole episode.
|
||||||
|
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
|
||||||
|
"place blue cube in box", "open drawer", "release yellow cube".
|
||||||
|
- Bad: "the robot arm moves towards the blue cube" (third person,
|
||||||
|
too long), "carefully pick up the cube" (adverb, article),
|
||||||
|
"release the yellow block" ("block" when the task said "cube").
|
||||||
- Subtasks are non-overlapping and cover the full episode in order.
|
- Subtasks are non-overlapping and cover the full episode in order.
|
||||||
Choose the cut points yourself based on what you see in the video
|
Choose the cut points yourself based on what you see in the video
|
||||||
(gripper open/close events, contact, regrasps, transitions).
|
(gripper open/close events, contact, regrasps, transitions).
|
||||||
@@ -26,22 +36,11 @@ Authoring rules — based on Hi Robot (Shi 2025) atom granularity:
|
|||||||
- Every subtask's [start_time, end_time] must lie within
|
- Every subtask's [start_time, end_time] must lie within
|
||||||
[0.0, {episode_duration}] seconds.
|
[0.0, {episode_duration}] seconds.
|
||||||
|
|
||||||
Style examples:
|
|
||||||
|
|
||||||
Good Bad (do NOT produce these)
|
|
||||||
"pick up the orange" "the robot arm moves to the orange"
|
|
||||||
"move to the yellow block" "the gripper approaches the block"
|
|
||||||
"close gripper to grasp "close the gripper to grasp the
|
|
||||||
the yellow cube" yellow cube so it can lift it"
|
|
||||||
"open the toaster oven" "it opens the toaster oven door"
|
|
||||||
"put the bagel on the "the white plate now has the bagel
|
|
||||||
white plate" placed on it by the arm"
|
|
||||||
|
|
||||||
Output strictly valid JSON of shape:
|
Output strictly valid JSON of shape:
|
||||||
|
|
||||||
{{
|
{{
|
||||||
"subtasks": [
|
"subtasks": [
|
||||||
{{"text": "<short imperative command>", "start": <float>, "end": <float>}},
|
{{"text": "<short imperative verb phrase>", "start": <float>, "end": <float>}},
|
||||||
...
|
...
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
|
|||||||
@@ -14,12 +14,10 @@ subtask boundary in the demonstration:
|
|||||||
- Subtask the robot is about to start: "{next_subtask}"
|
- Subtask the robot is about to start: "{next_subtask}"
|
||||||
- Time into episode: {timestamp:.2f}s
|
- Time into episode: {timestamp:.2f}s
|
||||||
|
|
||||||
Write ONE interjection the user would naturally say at this moment to
|
Write ONE compact interjection the user would naturally say at this
|
||||||
prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it
|
moment to prompt / confirm / encourage the robot to do "{next_subtask}".
|
||||||
like a real human mid-task remark — conversational, varied, sometimes
|
Keep it like a mid-task coaching cue, not a full instruction paragraph.
|
||||||
just a nudge, sometimes a clarification, sometimes a small constraint
|
Also write the robot's compact verbal acknowledgement.
|
||||||
that the upcoming motion happens to satisfy. Plus the robot's verbal
|
|
||||||
acknowledgement.
|
|
||||||
|
|
||||||
Hard rules:
|
Hard rules:
|
||||||
|
|
||||||
@@ -29,7 +27,9 @@ Hard rules:
|
|||||||
instead", DO NOT — those would contradict the demonstration.
|
instead", DO NOT — those would contradict the demonstration.
|
||||||
- The interjection must reference an object, location, or action that
|
- The interjection must reference an object, location, or action that
|
||||||
is plausible given the visible scene and the next subtask text.
|
is plausible given the visible scene and the next subtask text.
|
||||||
- One sentence each. Conversational, not robotic.
|
- One short phrase or sentence each. Conversational, not robotic.
|
||||||
|
- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}."
|
||||||
|
- Keep robot speech very short: "OK.", "On it.", "Doing that."
|
||||||
|
|
||||||
Style examples (vary the phrasing — don't reuse these verbatim):
|
Style examples (vary the phrasing — don't reuse these verbatim):
|
||||||
- "Now go ahead and {next_subtask}."
|
- "Now go ahead and {next_subtask}."
|
||||||
@@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim):
|
|||||||
|
|
||||||
Output strictly valid JSON:
|
Output strictly valid JSON:
|
||||||
{{
|
{{
|
||||||
"interjection": "<single sentence the user says, asking for the next subtask>",
|
"interjection": "<short cue from the user, asking for the next subtask>",
|
||||||
"speech": "<single sentence the robot speaks back, confirming and starting>"
|
"speech": "<short robot acknowledgement>"
|
||||||
}}
|
}}
|
||||||
|
|||||||
@@ -149,6 +149,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
|
|||||||
)
|
)
|
||||||
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
|
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
|
||||||
|
|
||||||
|
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
|
||||||
|
# resolves the dataset revision via ``get_safe_version`` which scans
|
||||||
|
# for tags like ``v3.0``; without a tag it raises
|
||||||
|
# ``RevisionNotFoundError``. Read the version straight from the
|
||||||
|
# dataset's own ``meta/info.json`` so we tag whatever the writer
|
||||||
|
# actually wrote (no accidental drift if the codebase floor moves).
|
||||||
|
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
|
||||||
|
|
||||||
|
info_path = root / "meta" / "info.json"
|
||||||
|
version_tag = CODEBASE_VERSION
|
||||||
|
if info_path.exists():
|
||||||
|
try:
|
||||||
|
from lerobot.utils.io_utils import load_json # noqa: PLC0415
|
||||||
|
|
||||||
|
info = load_json(info_path)
|
||||||
|
ds_version = info.get("codebase_version")
|
||||||
|
if isinstance(ds_version, str) and ds_version.startswith("v"):
|
||||||
|
version_tag = ds_version
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
|
||||||
|
try:
|
||||||
|
api.create_tag(
|
||||||
|
repo_id=repo_id,
|
||||||
|
tag=version_tag,
|
||||||
|
repo_type="dataset",
|
||||||
|
exist_ok=True,
|
||||||
|
)
|
||||||
|
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
print(
|
||||||
|
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
|
||||||
|
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
|
||||||
|
"Run: from huggingface_hub import HfApi; "
|
||||||
|
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
annotate()
|
annotate()
|
||||||
|
|||||||
@@ -80,7 +80,6 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
|
|||||||
{"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
|
{"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"concise hierarchical PLAN": {"plan": "1. grasp\n2. wipe\n3. place"},
|
|
||||||
"Update the memory": {"memory": "wiped the counter once"},
|
"Update the memory": {"memory": "wiped the counter once"},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -96,10 +95,16 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
|
|||||||
frame_set = set(record.frame_timestamps)
|
frame_set = set(record.frame_timestamps)
|
||||||
for row in rows:
|
for row in rows:
|
||||||
assert row["timestamp"] in frame_set
|
assert row["timestamp"] in frame_set
|
||||||
# exactly one plan row at t0
|
# one plan row per subtask boundary; the first lands at t0 and each
|
||||||
plan_rows = [r for r in rows if r["style"] == "plan"]
|
# plan is the deterministic numbered list of still-todo subtasks
|
||||||
assert len(plan_rows) == 1
|
plan_rows = sorted((r for r in rows if r["style"] == "plan"), key=lambda r: r["timestamp"])
|
||||||
|
subtask_rows = [r for r in rows if r["style"] == "subtask"]
|
||||||
|
assert len(plan_rows) == len(subtask_rows)
|
||||||
assert plan_rows[0]["timestamp"] == record.frame_timestamps[0]
|
assert plan_rows[0]["timestamp"] == record.frame_timestamps[0]
|
||||||
|
# the t0 plan enumerates all subtasks; later plans shrink
|
||||||
|
assert plan_rows[0]["content"].startswith("1. ")
|
||||||
|
assert len(plan_rows[0]["content"].splitlines()) == len(subtask_rows)
|
||||||
|
assert len(plan_rows[-1]["content"].splitlines()) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: Path, tmp_path: Path) -> None:
|
def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: Path, tmp_path: Path) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user