mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-18 16:57:12 +00:00
annotate: remove action-record subtask-text replacement entirely
Drops the replace_subtask_text option and the
_render_action_record_to_subtask_text renderer. Action records are now
strictly additive: when action_records.enabled=True the module emits
style='action_record' rows (the typed {verb, object, arm, grasp_type,
destination, mistake} schema) and NEVER rewrites the subtask text the policy
conditions on.
The render-back-to-text path was the source of corrupted subtasks
(navigation tasks produced 'move stove to stove', manipulation tasks
got spurious 'with left arm using pinch grip' suffixes). Reconstructing
natural-language subtasks from hallucinated structured fields is
inherently fragile, so the capability is removed rather than guarded.
Removed:
* ActionRecordsConfig.replace_subtask_text field
* PlanSubtasksMemoryModule._render_action_record_to_subtask_text
* the span['text'] = canonical_text overwrite in run_episode
Updated docstrings + run_hf_job.py comment accordingly. emit_record_row
(default True) is now the feature's only output.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -71,10 +71,10 @@ CMD = (
|
||||
"--plan.n_task_rephrasings=0 "
|
||||
# action_records OFF: the structured {verb,object,arm,grasp,dest}
|
||||
# schema is a manipulation schema; RoboCasa navigation / atomic tasks
|
||||
# don't fit it and the VLM hallucinates (e.g. "move stove to stove").
|
||||
# Leave off unless annotating long composite manipulation tasks you've
|
||||
# verified render cleanly (and even then replace_subtask_text stays
|
||||
# off by default so records are additive, never overwriting subtasks).
|
||||
# don't fit it and the VLM hallucinates. When on, records are purely
|
||||
# additive (emitted as style="action_record" rows) and never touch
|
||||
# the subtask text — useful only for long composite manipulation
|
||||
# tasks. Leave off for RoboCasa atomic / navigation.
|
||||
# Keep subtask decomposition tight for atomic tasks:
|
||||
"--plan.plan_max_steps=6 "
|
||||
# Phase 2 — interjections + speech.
|
||||
|
||||
@@ -92,20 +92,16 @@ class ActionRecordsConfig:
|
||||
"mistake": "<short text>" | null,
|
||||
}
|
||||
|
||||
A deterministic Python template then renders the record back to
|
||||
canonical subtask text (e.g. ``pick blue cube with left arm using
|
||||
pinch grip``). When ``replace_subtask_text=True``, the rendered text
|
||||
REPLACES the VLM's free-form subtask text. This is OFF by default:
|
||||
the structured fields are easy for the VLM to hallucinate on tasks
|
||||
that don't fit the manipulation schema (e.g. navigation tasks yield
|
||||
nonsense like ``move stove to stove``), and silently overwriting the
|
||||
subtask text with a reconstruction is high-risk. Leave it off to keep
|
||||
the original VLM subtask text and treat the record as additive
|
||||
metadata; only flip it on for datasets you've verified render
|
||||
cleanly. When ``emit_record_row=True`` (default), the structured
|
||||
record is also emitted as a row with ``style="action_record"`` so
|
||||
downstream consumers can train on the typed schema directly —
|
||||
without touching the subtask text.
|
||||
The record is emitted as a separate row with ``style="action_record"``
|
||||
(``content=json.dumps(record)``) at the subtask's start timestamp.
|
||||
It is PURELY ADDITIVE — it never touches the VLM's subtask text.
|
||||
Downstream training can consume the typed schema directly (e.g.
|
||||
auxiliary supervision on verb / arm / grasp classification heads)
|
||||
while the subtask string the policy conditions on stays exactly what
|
||||
the subtask module produced. (Reconstructing subtask text from these
|
||||
fields was too easy for the VLM to hallucinate on tasks that don't
|
||||
fit the manipulation schema — navigation tasks yielded nonsense like
|
||||
``move stove to stove`` — so that path was removed.)
|
||||
|
||||
Cost: one extra VLM call per subtask. For an 8-subtask episode this
|
||||
means ~8x more VLM calls in the plan module — still cheap relative
|
||||
@@ -114,18 +110,10 @@ class ActionRecordsConfig:
|
||||
|
||||
enabled: bool = False
|
||||
|
||||
# When True, replace the VLM-generated subtask text with the
|
||||
# deterministic template's rendering of the structured record.
|
||||
# OFF by default — see class docstring. Overwriting good subtask
|
||||
# text with a reconstruction of hallucinated structured fields is
|
||||
# high-risk (navigation / non-manipulation tasks render to
|
||||
# nonsense). Keep records additive (``emit_record_row``) instead.
|
||||
replace_subtask_text: bool = False
|
||||
|
||||
# When True, emit a separate row with ``style="action_record"`` and
|
||||
# ``content=json.dumps(record)`` at the subtask's start timestamp.
|
||||
# Lets downstream training consume the typed schema directly (e.g.
|
||||
# auxiliary supervision on verb/arm/grasp classification heads).
|
||||
# When True (default), emit a separate row with ``style="action_record"``
|
||||
# and ``content=json.dumps(record)`` at the subtask's start timestamp.
|
||||
# This is the only output of the feature — set ``enabled=False`` to
|
||||
# skip the extra VLM calls entirely.
|
||||
emit_record_row: bool = True
|
||||
|
||||
# Frame sampling for the per-subtask VLM call (similar to the
|
||||
|
||||
@@ -124,28 +124,24 @@ class PlanSubtasksMemoryModule:
|
||||
subtask_spans = self._generate_subtasks(record, task=effective_task)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# Phase 1a + 1b: structured per-subtask action records
|
||||
# Phase 1a: structured per-subtask action records (additive)
|
||||
# ----------------------------------------------------------------
|
||||
# When enabled, for every subtask span we ask the VLM for a typed
|
||||
# ActionRecord (verb / object / arm / grasp_type / destination /
|
||||
# mistake). A deterministic Python template renders the record
|
||||
# back to canonical subtask text. The render replaces the
|
||||
# free-form subtask text (cleaner conditioning) and the typed
|
||||
# record is emitted as a separate row for downstream use.
|
||||
# mistake) and emit it as a separate ``style="action_record"``
|
||||
# row for downstream use. This is purely additive — it never
|
||||
# touches the VLM's subtask text (reconstructing subtask text
|
||||
# from these fields was too easy to hallucinate on tasks that
|
||||
# don't fit the manipulation schema).
|
||||
records_cfg = self.config.action_records
|
||||
action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
|
||||
if records_cfg.enabled and subtask_spans:
|
||||
for i, span in enumerate(subtask_spans):
|
||||
rec = self._extract_action_record(record, span, effective_task)
|
||||
if rec is None:
|
||||
continue
|
||||
action_records[i] = rec
|
||||
if records_cfg.replace_subtask_text:
|
||||
canonical_text = self._render_action_record_to_subtask_text(rec)
|
||||
if canonical_text:
|
||||
span["text"] = canonical_text
|
||||
if rec is not None:
|
||||
action_records[i] = rec
|
||||
|
||||
# subtask rows (may now reflect canonical-rendered text)
|
||||
# subtask rows
|
||||
for i, span in enumerate(subtask_spans):
|
||||
rows.append(
|
||||
{
|
||||
@@ -396,60 +392,6 @@ class PlanSubtasksMemoryModule:
|
||||
"mistake": mistake,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _render_action_record_to_subtask_text(record: dict[str, Any]) -> str:
|
||||
"""Deterministic template: ``ActionRecord`` → canonical subtask text.
|
||||
|
||||
Mirrors the authoring guidance in ``module_1_subtasks.txt``:
|
||||
imperative, drop articles / adverbs, use canonical object nouns,
|
||||
append arm / grasp clauses only when present.
|
||||
|
||||
Examples (record → rendered text)::
|
||||
|
||||
{verb=pick, object=blue cube}
|
||||
→ "pick blue cube"
|
||||
{verb=pick, object=blue cube, arm=left, grasp_type=pinch}
|
||||
→ "pick blue cube with left arm using pinch grip"
|
||||
{verb=place, object=blue cube, destination=green box}
|
||||
→ "place blue cube in green box"
|
||||
{verb=move, object=mug, destination=stove}
|
||||
→ "move mug to stove"
|
||||
"""
|
||||
verb = (record.get("verb") or "").strip().lower()
|
||||
obj = (record.get("object") or "").strip()
|
||||
arm = (record.get("arm") or "").strip().lower() if record.get("arm") else ""
|
||||
grasp = (record.get("grasp_type") or "").strip().lower() if record.get("grasp_type") else ""
|
||||
dest = (record.get("destination") or "").strip() if record.get("destination") else ""
|
||||
|
||||
if not verb:
|
||||
return ""
|
||||
|
||||
# Drop a degenerate destination that just echoes the object — the
|
||||
# VLM sometimes fills both with the same noun (e.g. navigation:
|
||||
# ``verb=move object=stove destination=stove`` → "move stove to
|
||||
# stove"). Treat that as "no meaningful destination".
|
||||
if dest and obj and dest.strip().lower() == obj.strip().lower():
|
||||
dest = ""
|
||||
|
||||
parts: list[str] = [verb]
|
||||
if obj:
|
||||
parts.append(obj)
|
||||
if dest:
|
||||
# Pick a sensible preposition per verb family.
|
||||
if verb in {"place", "put", "drop", "insert", "pour", "dump"}:
|
||||
parts.append(f"in {dest}")
|
||||
elif verb in {"move", "transport", "reach", "navigate"}:
|
||||
parts.append(f"to {dest}")
|
||||
else:
|
||||
parts.append(f"at {dest}")
|
||||
if arm == "both":
|
||||
parts.append("with both arms")
|
||||
elif arm in {"left", "right"}:
|
||||
parts.append(f"with {arm} arm")
|
||||
if grasp:
|
||||
parts.append(f"using {grasp} grip")
|
||||
return " ".join(parts)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Structured 5-axis task augmentation (EgoMimic-style taxonomy)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user