diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index 85ac8f17c..3fb730d4a 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -87,8 +87,6 @@ CMD = ( # rephrasings are unused at best and harmful when they drift. "--plan.n_task_rephrasings=0 " # Keep subtask decomposition tight for atomic tasks. - # (action_records left off: the {verb,object,arm,grasp,dest} schema is for - # long manipulation tasks, not RoboCasa atomic/navigation.) "--plan.plan_max_steps=10 " # Only subtasks + memory — skip the numbered "plan" rows. true re-enables. "--plan.emit_plan=false " diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 1b16a927b..439201993 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -75,11 +75,6 @@ class PlanConfig: use_video_url: bool = False use_video_url_fps: float = 1.0 - # Optional structured per-subtask action records (EgoMimic-style). When - # enabled, the VLM extracts a typed record per subtask span; see - # ``ActionRecordsConfig``. Purely additive — off by default. - action_records: ActionRecordsConfig = field(default_factory=lambda: ActionRecordsConfig()) - # Optional 5-axis task-augmentation taxonomy for the t=0 variants # (EgoMimic-style: synonym / omit_arm / omit_orientation / # omit_grasp_method / combined). Replaces the free-form @@ -87,73 +82,6 @@ class PlanConfig: task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig()) -@dataclass -class ActionRecordsConfig: - """Structured per-subtask action record extraction. - - When ``enabled=True``, after subtask-span generation the module makes - one extra VLM call per subtask to extract a typed record:: - - { - "verb": "pick" | "place" | "press" | ..., # closed vocabulary - "object": "", - "arm": "left" | "right" | "both" | null, - "grasp_type": "pinch" | "wrap" | "hook" | ... | null, - "destination": "" | null, - "mistake": "" | null, - } - - Emitted as a separate ``style="action_record"`` row at the subtask's - start timestamp. PURELY ADDITIVE — it never touches the subtask text, - so downstream training can use the typed schema (e.g. auxiliary - verb/arm/grasp heads) while the conditioning string stays unchanged. - - Cost: one extra VLM call per subtask (~8x plan-module calls on an - 8-subtask episode). - """ - - enabled: bool = False - - # Emit the ``style="action_record"`` row (JSON content) at the subtask - # start — the only output of the feature. ``enabled=False`` skips it. - emit_record_row: bool = True - - # Frames sampled from the subtask span for the per-subtask VLM call. - frames_per_subtask: int = 4 - - # Closed verb vocabulary; the prompt picks exactly one. Override - # per-dataset (e.g. door-only manipulation) for a tighter constraint. - verb_vocabulary: tuple[str, ...] = ( - "pick", - "place", - "push", - "pull", - "open", - "close", - "turn", - "press", - "lift", - "insert", - "pour", - "move", - "reach", - "grasp", - "release", - "wipe", - "dump", - ) - - # Closed grasp-type vocabulary (``null`` always allowed). Adjust - # per-hardware (e.g. drop ``hook`` / ``key`` for parallel-jaw grippers). - grasp_vocabulary: tuple[str, ...] = ( - "pinch", - "wrap", - "hook", - "key", - "lateral", - ) - - @dataclass class TaskAugAxesConfig: """Structured 5-axis augmentation taxonomy for t=0 task variants. diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index c76a6acad..8f25fcfba 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -17,7 +17,6 @@ from __future__ import annotations -import json import logging from collections.abc import Sequence from dataclasses import dataclass, field @@ -29,7 +28,6 @@ from ..frames import ( FrameProvider, VideoFrameProvider, null_provider, - to_image_blocks, to_video_block, to_video_url_block, ) @@ -84,20 +82,8 @@ class PlanSubtasksMemoryModule: subtask_spans = self._generate_subtasks(record, task=effective_task) - # Phase 1a: optional per-subtask action records. When enabled, emit a - # typed ActionRecord (verb/object/arm/grasp_type/destination/mistake) - # per span as a separate style="action_record" row. Purely additive — - # never touches the subtask text. - records_cfg = self.config.action_records - action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans) - if records_cfg.enabled and subtask_spans: - for i, span in enumerate(subtask_spans): - rec = self._extract_action_record(record, span, effective_task) - if rec is not None: - action_records[i] = rec - # subtask rows - for i, span in enumerate(subtask_spans): + for span in subtask_spans: rows.append( { "role": "assistant", @@ -107,16 +93,6 @@ class PlanSubtasksMemoryModule: "tool_calls": None, } ) - if records_cfg.enabled and records_cfg.emit_record_row and action_records[i] is not None: - rows.append( - { - "role": "assistant", - "content": json.dumps(action_records[i], sort_keys=True), - "style": "action_record", - "timestamp": snap_to_frame(span["start"], record.frame_timestamps), - "tool_calls": None, - } - ) # Plan rows at every subtask boundary (incl. t=0). The plan is a # numbered list of still-todo subtasks, so re-emitting at each # boundary makes it shrink as work progresses — ${plan} at frame t is @@ -264,107 +240,6 @@ class PlanSubtasksMemoryModule: out = [item.strip().strip('"').strip("'") for item in raw if isinstance(item, str)] return [s for s in out if s][:n] - # ------------------------------------------------------------------ - # Phase 1a + 1b: structured per-subtask action records - # ------------------------------------------------------------------ - - def _extract_action_record( - self, - record: EpisodeRecord, - span: dict[str, Any], - episode_task: str, - ) -> dict[str, Any] | None: - """Ask the VLM to extract a typed ``ActionRecord`` from a subtask span. - - Sends ``frames_per_subtask`` frames uniformly sampled from - ``[span.start, span.end]`` plus the canonical subtask text. The - VLM is constrained to verb + grasp vocabularies from the config - — invalid values are silently dropped at this layer (the - validator catches structural problems pre-write). - - Returns ``None`` when the call fails or the VLM returns something - unrecognizable; callers fall back to the free-form subtask text. - """ - cfg = self.config.action_records - start_t = float(span.get("start", 0.0)) - end_t = float(span.get("end", start_t)) - duration = max(0.0, end_t - start_t) - - # Uniform timestamps within the span; fall back to a single - # center frame for very short spans. - n = max(1, int(cfg.frames_per_subtask)) - if n == 1 or duration <= 0.0: - timestamps = [0.5 * (start_t + end_t)] - else: - step = duration / (n - 1) - timestamps = [start_t + i * step for i in range(n)] - frames = self.frame_provider.frames_at(record, timestamps) - if not frames: - logger.debug( - "action_record: no frames at span %.2f-%.2f for ep %s; skipping", - start_t, - end_t, - record.episode_index, - ) - return None - - prompt = load_prompt("plan_action_record").format( - episode_task=episode_task, - subtask_text=span.get("text", ""), - start_time=start_t, - end_time=end_t, - duration=duration, - n_frames=len(frames), - verb_vocabulary=", ".join(cfg.verb_vocabulary), - grasp_vocabulary=" | ".join(f'"{g}"' for g in cfg.grasp_vocabulary), - ) - message = [ - { - "role": "user", - "content": [*to_image_blocks(frames), {"type": "text", "text": prompt}], - } - ] - result = self.vlm.generate_json([message])[0] - if not isinstance(result, dict): - return None - - # Light validation + normalisation. Verb is required; everything - # else may be null. Verb / grasp_type are clamped to the - # vocabularies (out-of-vocab → reject or null). - verb = (result.get("verb") or "").strip().lower() - if not verb or verb not in {v.lower() for v in cfg.verb_vocabulary}: - return None - obj = (result.get("object") or "").strip() - if not obj: - return None - grasp = result.get("grasp_type") - if isinstance(grasp, str): - grasp = grasp.strip().lower() - if grasp not in {g.lower() for g in cfg.grasp_vocabulary}: - grasp = None - else: - grasp = None - arm = result.get("arm") - if isinstance(arm, str): - arm = arm.strip().lower() - if arm not in {"left", "right", "both"}: - arm = None - else: - arm = None - destination = result.get("destination") - destination = destination.strip() if isinstance(destination, str) and destination.strip() else None - mistake = result.get("mistake") - mistake = mistake.strip() if isinstance(mistake, str) and mistake.strip() else None - - return { - "verb": verb, - "object": obj, - "arm": arm, - "grasp_type": grasp, - "destination": destination, - "mistake": mistake, - } - # ------------------------------------------------------------------ # Structured 5-axis task augmentation (EgoMimic-style taxonomy) # ------------------------------------------------------------------ diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt deleted file mode 100644 index 1bd127048..000000000 --- a/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt +++ /dev/null @@ -1,64 +0,0 @@ -You are extracting a structured action record from a subtask span of a -teleoperated robot demonstration. This is Phase 1a of a two-step -process: you extract a typed record; a deterministic template then -renders it back to canonical subtask text. Your job is the PERCEPTION -step — not the language step. - -The user originally asked: "{episode_task}" -The subtask span is: "{subtask_text}" -Span time window: [{start_time:.2f}s, {end_time:.2f}s] - ({duration:.2f}s of robot activity) - -You are shown {n_frames} frames sampled uniformly from the subtask -window. Fill in a structured record describing the action that takes -place between the first and last frame. - -Hard rules: -- Use ONLY information visible in the frames. Do not infer details from - outside the span. Do not extrapolate from the original task wording. -- Use canonical object names from the original task VERBATIM. Never - introduce synonyms: if the task says "cube", the record says "cube", - never "block" / "object" / "item". -- For non-applicable fields, use ``null`` (not "n/a", not "none", not - an empty string). -- For ``verb`` and ``grasp_type``, pick EXACTLY one value from the - vocabulary below. Never invent a new one. - -Field schema: - - verb (required) — the imperative verb of the action. Vocabulary: - {verb_vocabulary} - - object (required) — the manipulated object. Use the canonical noun - from the original task above. - - arm — which arm performs the action. One of: - "left" | "right" | "both" | null - Use ``null`` when the source robot is single-arm or when the arm - is genuinely not visible in the frames. - - grasp_type — which grip the gripper uses on contact. One of: - {grasp_vocabulary} | null - Use ``null`` when there is no contact in this span (e.g. a pure - ``move`` / ``reach`` subtask) or the grip is genuinely unclear. - - destination — the target location for actions like ``place``, - ``move``, ``insert``, ``pour``. Use canonical names from the - original task. Use ``null`` for in-place actions (``press``, - ``turn``, ``grasp``, ``release``). - - mistake — a brief one-clause description of any visible failure or - recovery during the span (e.g. "dropped the cube and re-grasped", - "missed the target on first attempt"). Use ``null`` when the span - completes cleanly with no visible recovery. - -Output strictly valid JSON of shape: - - {{ - "verb": "", - "object": "", - "arm": "left" | "right" | "both" | null, - "grasp_type": "" | null, - "destination": "" | null, - "mistake": "" | null - }} diff --git a/src/lerobot/datasets/language.py b/src/lerobot/datasets/language.py index f3d371545..124c25221 100644 --- a/src/lerobot/datasets/language.py +++ b/src/lerobot/datasets/language.py @@ -36,7 +36,6 @@ CORE_STYLES = { "vqa", "trace", "task_aug", - "action_record", } # Project-local styles can be registered at import time by appending to # ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added @@ -47,7 +46,7 @@ CORE_STYLES = { EXTENDED_STYLES: set[str] = set() STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES -PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} +PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"} EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"} # Styles whose ``content`` is grounded in a specific camera view. Rows of these diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py index 2846dab1d..52c7b3708 100644 --- a/tests/datasets/test_language.py +++ b/tests/datasets/test_language.py @@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog): def test_style_registry_routes_columns(): - assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES + assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY