mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-18 00:37:10 +00:00
annotate: remove the action_record style/feature entirely
Drop the optional structured per-subtask action records — not a feature
we want to ship.
* language.py: remove 'action_record' from CORE_STYLES + PERSISTENT_STYLES
(and from the matching assertion in tests/datasets/test_language.py).
* config.py: delete ActionRecordsConfig (verb/grasp vocabularies,
frames_per_subtask, emit_record_row) and the PlanConfig.action_records
field.
* plan_subtasks_memory.py: delete _extract_action_record and the
run_episode block that emitted style='action_record' rows; drop the
now-unused json / to_image_blocks imports.
* remove the plan_action_record.txt prompt.
* run_hf_job.py: drop the action_records comment.
Verified: 40 tests pass; pre-commit (ruff, mypy, bandit) clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -87,8 +87,6 @@ CMD = (
|
||||
# rephrasings are unused at best and harmful when they drift.
|
||||
"--plan.n_task_rephrasings=0 "
|
||||
# Keep subtask decomposition tight for atomic tasks.
|
||||
# (action_records left off: the {verb,object,arm,grasp,dest} schema is for
|
||||
# long manipulation tasks, not RoboCasa atomic/navigation.)
|
||||
"--plan.plan_max_steps=10 "
|
||||
# Only subtasks + memory — skip the numbered "plan" rows. true re-enables.
|
||||
"--plan.emit_plan=false "
|
||||
|
||||
@@ -75,11 +75,6 @@ class PlanConfig:
|
||||
use_video_url: bool = False
|
||||
use_video_url_fps: float = 1.0
|
||||
|
||||
# Optional structured per-subtask action records (EgoMimic-style). When
|
||||
# enabled, the VLM extracts a typed record per subtask span; see
|
||||
# ``ActionRecordsConfig``. Purely additive — off by default.
|
||||
action_records: ActionRecordsConfig = field(default_factory=lambda: ActionRecordsConfig())
|
||||
|
||||
# Optional 5-axis task-augmentation taxonomy for the t=0 variants
|
||||
# (EgoMimic-style: synonym / omit_arm / omit_orientation /
|
||||
# omit_grasp_method / combined). Replaces the free-form
|
||||
@@ -87,73 +82,6 @@ class PlanConfig:
|
||||
task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())
|
||||
|
||||
|
||||
@dataclass
|
||||
class ActionRecordsConfig:
|
||||
"""Structured per-subtask action record extraction.
|
||||
|
||||
When ``enabled=True``, after subtask-span generation the module makes
|
||||
one extra VLM call per subtask to extract a typed record::
|
||||
|
||||
{
|
||||
"verb": "pick" | "place" | "press" | ..., # closed vocabulary
|
||||
"object": "<canonical_object_name>",
|
||||
"arm": "left" | "right" | "both" | null,
|
||||
"grasp_type": "pinch" | "wrap" | "hook" | ... | null,
|
||||
"destination": "<canonical_destination>" | null,
|
||||
"mistake": "<short text>" | null,
|
||||
}
|
||||
|
||||
Emitted as a separate ``style="action_record"`` row at the subtask's
|
||||
start timestamp. PURELY ADDITIVE — it never touches the subtask text,
|
||||
so downstream training can use the typed schema (e.g. auxiliary
|
||||
verb/arm/grasp heads) while the conditioning string stays unchanged.
|
||||
|
||||
Cost: one extra VLM call per subtask (~8x plan-module calls on an
|
||||
8-subtask episode).
|
||||
"""
|
||||
|
||||
enabled: bool = False
|
||||
|
||||
# Emit the ``style="action_record"`` row (JSON content) at the subtask
|
||||
# start — the only output of the feature. ``enabled=False`` skips it.
|
||||
emit_record_row: bool = True
|
||||
|
||||
# Frames sampled from the subtask span for the per-subtask VLM call.
|
||||
frames_per_subtask: int = 4
|
||||
|
||||
# Closed verb vocabulary; the prompt picks exactly one. Override
|
||||
# per-dataset (e.g. door-only manipulation) for a tighter constraint.
|
||||
verb_vocabulary: tuple[str, ...] = (
|
||||
"pick",
|
||||
"place",
|
||||
"push",
|
||||
"pull",
|
||||
"open",
|
||||
"close",
|
||||
"turn",
|
||||
"press",
|
||||
"lift",
|
||||
"insert",
|
||||
"pour",
|
||||
"move",
|
||||
"reach",
|
||||
"grasp",
|
||||
"release",
|
||||
"wipe",
|
||||
"dump",
|
||||
)
|
||||
|
||||
# Closed grasp-type vocabulary (``null`` always allowed). Adjust
|
||||
# per-hardware (e.g. drop ``hook`` / ``key`` for parallel-jaw grippers).
|
||||
grasp_vocabulary: tuple[str, ...] = (
|
||||
"pinch",
|
||||
"wrap",
|
||||
"hook",
|
||||
"key",
|
||||
"lateral",
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskAugAxesConfig:
|
||||
"""Structured 5-axis augmentation taxonomy for t=0 task variants.
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass, field
|
||||
@@ -29,7 +28,6 @@ from ..frames import (
|
||||
FrameProvider,
|
||||
VideoFrameProvider,
|
||||
null_provider,
|
||||
to_image_blocks,
|
||||
to_video_block,
|
||||
to_video_url_block,
|
||||
)
|
||||
@@ -84,20 +82,8 @@ class PlanSubtasksMemoryModule:
|
||||
|
||||
subtask_spans = self._generate_subtasks(record, task=effective_task)
|
||||
|
||||
# Phase 1a: optional per-subtask action records. When enabled, emit a
|
||||
# typed ActionRecord (verb/object/arm/grasp_type/destination/mistake)
|
||||
# per span as a separate style="action_record" row. Purely additive —
|
||||
# never touches the subtask text.
|
||||
records_cfg = self.config.action_records
|
||||
action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
|
||||
if records_cfg.enabled and subtask_spans:
|
||||
for i, span in enumerate(subtask_spans):
|
||||
rec = self._extract_action_record(record, span, effective_task)
|
||||
if rec is not None:
|
||||
action_records[i] = rec
|
||||
|
||||
# subtask rows
|
||||
for i, span in enumerate(subtask_spans):
|
||||
for span in subtask_spans:
|
||||
rows.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
@@ -107,16 +93,6 @@ class PlanSubtasksMemoryModule:
|
||||
"tool_calls": None,
|
||||
}
|
||||
)
|
||||
if records_cfg.enabled and records_cfg.emit_record_row and action_records[i] is not None:
|
||||
rows.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": json.dumps(action_records[i], sort_keys=True),
|
||||
"style": "action_record",
|
||||
"timestamp": snap_to_frame(span["start"], record.frame_timestamps),
|
||||
"tool_calls": None,
|
||||
}
|
||||
)
|
||||
# Plan rows at every subtask boundary (incl. t=0). The plan is a
|
||||
# numbered list of still-todo subtasks, so re-emitting at each
|
||||
# boundary makes it shrink as work progresses — ${plan} at frame t is
|
||||
@@ -264,107 +240,6 @@ class PlanSubtasksMemoryModule:
|
||||
out = [item.strip().strip('"').strip("'") for item in raw if isinstance(item, str)]
|
||||
return [s for s in out if s][:n]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Phase 1a + 1b: structured per-subtask action records
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_action_record(
|
||||
self,
|
||||
record: EpisodeRecord,
|
||||
span: dict[str, Any],
|
||||
episode_task: str,
|
||||
) -> dict[str, Any] | None:
|
||||
"""Ask the VLM to extract a typed ``ActionRecord`` from a subtask span.
|
||||
|
||||
Sends ``frames_per_subtask`` frames uniformly sampled from
|
||||
``[span.start, span.end]`` plus the canonical subtask text. The
|
||||
VLM is constrained to verb + grasp vocabularies from the config
|
||||
— invalid values are silently dropped at this layer (the
|
||||
validator catches structural problems pre-write).
|
||||
|
||||
Returns ``None`` when the call fails or the VLM returns something
|
||||
unrecognizable; callers fall back to the free-form subtask text.
|
||||
"""
|
||||
cfg = self.config.action_records
|
||||
start_t = float(span.get("start", 0.0))
|
||||
end_t = float(span.get("end", start_t))
|
||||
duration = max(0.0, end_t - start_t)
|
||||
|
||||
# Uniform timestamps within the span; fall back to a single
|
||||
# center frame for very short spans.
|
||||
n = max(1, int(cfg.frames_per_subtask))
|
||||
if n == 1 or duration <= 0.0:
|
||||
timestamps = [0.5 * (start_t + end_t)]
|
||||
else:
|
||||
step = duration / (n - 1)
|
||||
timestamps = [start_t + i * step for i in range(n)]
|
||||
frames = self.frame_provider.frames_at(record, timestamps)
|
||||
if not frames:
|
||||
logger.debug(
|
||||
"action_record: no frames at span %.2f-%.2f for ep %s; skipping",
|
||||
start_t,
|
||||
end_t,
|
||||
record.episode_index,
|
||||
)
|
||||
return None
|
||||
|
||||
prompt = load_prompt("plan_action_record").format(
|
||||
episode_task=episode_task,
|
||||
subtask_text=span.get("text", ""),
|
||||
start_time=start_t,
|
||||
end_time=end_t,
|
||||
duration=duration,
|
||||
n_frames=len(frames),
|
||||
verb_vocabulary=", ".join(cfg.verb_vocabulary),
|
||||
grasp_vocabulary=" | ".join(f'"{g}"' for g in cfg.grasp_vocabulary),
|
||||
)
|
||||
message = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [*to_image_blocks(frames), {"type": "text", "text": prompt}],
|
||||
}
|
||||
]
|
||||
result = self.vlm.generate_json([message])[0]
|
||||
if not isinstance(result, dict):
|
||||
return None
|
||||
|
||||
# Light validation + normalisation. Verb is required; everything
|
||||
# else may be null. Verb / grasp_type are clamped to the
|
||||
# vocabularies (out-of-vocab → reject or null).
|
||||
verb = (result.get("verb") or "").strip().lower()
|
||||
if not verb or verb not in {v.lower() for v in cfg.verb_vocabulary}:
|
||||
return None
|
||||
obj = (result.get("object") or "").strip()
|
||||
if not obj:
|
||||
return None
|
||||
grasp = result.get("grasp_type")
|
||||
if isinstance(grasp, str):
|
||||
grasp = grasp.strip().lower()
|
||||
if grasp not in {g.lower() for g in cfg.grasp_vocabulary}:
|
||||
grasp = None
|
||||
else:
|
||||
grasp = None
|
||||
arm = result.get("arm")
|
||||
if isinstance(arm, str):
|
||||
arm = arm.strip().lower()
|
||||
if arm not in {"left", "right", "both"}:
|
||||
arm = None
|
||||
else:
|
||||
arm = None
|
||||
destination = result.get("destination")
|
||||
destination = destination.strip() if isinstance(destination, str) and destination.strip() else None
|
||||
mistake = result.get("mistake")
|
||||
mistake = mistake.strip() if isinstance(mistake, str) and mistake.strip() else None
|
||||
|
||||
return {
|
||||
"verb": verb,
|
||||
"object": obj,
|
||||
"arm": arm,
|
||||
"grasp_type": grasp,
|
||||
"destination": destination,
|
||||
"mistake": mistake,
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Structured 5-axis task augmentation (EgoMimic-style taxonomy)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
You are extracting a structured action record from a subtask span of a
|
||||
teleoperated robot demonstration. This is Phase 1a of a two-step
|
||||
process: you extract a typed record; a deterministic template then
|
||||
renders it back to canonical subtask text. Your job is the PERCEPTION
|
||||
step — not the language step.
|
||||
|
||||
The user originally asked: "{episode_task}"
|
||||
The subtask span is: "{subtask_text}"
|
||||
Span time window: [{start_time:.2f}s, {end_time:.2f}s]
|
||||
({duration:.2f}s of robot activity)
|
||||
|
||||
You are shown {n_frames} frames sampled uniformly from the subtask
|
||||
window. Fill in a structured record describing the action that takes
|
||||
place between the first and last frame.
|
||||
|
||||
Hard rules:
|
||||
- Use ONLY information visible in the frames. Do not infer details from
|
||||
outside the span. Do not extrapolate from the original task wording.
|
||||
- Use canonical object names from the original task VERBATIM. Never
|
||||
introduce synonyms: if the task says "cube", the record says "cube",
|
||||
never "block" / "object" / "item".
|
||||
- For non-applicable fields, use ``null`` (not "n/a", not "none", not
|
||||
an empty string).
|
||||
- For ``verb`` and ``grasp_type``, pick EXACTLY one value from the
|
||||
vocabulary below. Never invent a new one.
|
||||
|
||||
Field schema:
|
||||
|
||||
verb (required) — the imperative verb of the action. Vocabulary:
|
||||
{verb_vocabulary}
|
||||
|
||||
object (required) — the manipulated object. Use the canonical noun
|
||||
from the original task above.
|
||||
|
||||
arm — which arm performs the action. One of:
|
||||
"left" | "right" | "both" | null
|
||||
Use ``null`` when the source robot is single-arm or when the arm
|
||||
is genuinely not visible in the frames.
|
||||
|
||||
grasp_type — which grip the gripper uses on contact. One of:
|
||||
{grasp_vocabulary} | null
|
||||
Use ``null`` when there is no contact in this span (e.g. a pure
|
||||
``move`` / ``reach`` subtask) or the grip is genuinely unclear.
|
||||
|
||||
destination — the target location for actions like ``place``,
|
||||
``move``, ``insert``, ``pour``. Use canonical names from the
|
||||
original task. Use ``null`` for in-place actions (``press``,
|
||||
``turn``, ``grasp``, ``release``).
|
||||
|
||||
mistake — a brief one-clause description of any visible failure or
|
||||
recovery during the span (e.g. "dropped the cube and re-grasped",
|
||||
"missed the target on first attempt"). Use ``null`` when the span
|
||||
completes cleanly with no visible recovery.
|
||||
|
||||
Output strictly valid JSON of shape:
|
||||
|
||||
{{
|
||||
"verb": "<one of vocabulary>",
|
||||
"object": "<canonical noun>",
|
||||
"arm": "left" | "right" | "both" | null,
|
||||
"grasp_type": "<one of vocabulary>" | null,
|
||||
"destination": "<canonical noun>" | null,
|
||||
"mistake": "<short description>" | null
|
||||
}}
|
||||
@@ -36,7 +36,6 @@ CORE_STYLES = {
|
||||
"vqa",
|
||||
"trace",
|
||||
"task_aug",
|
||||
"action_record",
|
||||
}
|
||||
# Project-local styles can be registered at import time by appending to
|
||||
# ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added
|
||||
@@ -47,7 +46,7 @@ CORE_STYLES = {
|
||||
EXTENDED_STYLES: set[str] = set()
|
||||
STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
|
||||
|
||||
PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug", "action_record"}
|
||||
PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
|
||||
EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
|
||||
|
||||
# Styles whose ``content`` is grounded in a specific camera view. Rows of these
|
||||
|
||||
@@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
|
||||
|
||||
|
||||
def test_style_registry_routes_columns():
|
||||
assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES
|
||||
assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES
|
||||
assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES
|
||||
assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY
|
||||
|
||||
|
||||
Reference in New Issue
Block a user