mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-18 00:37:10 +00:00
annotations(steerable): structured action records + 5-axis task augmentation
EgoMimic-inspired additions to the plan module, both opt-in for back-compat.
1. PHASE 1a + 1b: per-subtask structured action records
* cfg.action_records.enabled=True triggers, after Phase 1 subtask-span
generation, one extra VLM call per subtask to extract a typed record:
{verb, object, arm, grasp_type, destination, mistake}
* A deterministic Python template (_render_action_record_to_subtask_text)
renders the record back to canonical subtask text. When
replace_subtask_text=True (default), this REPLACES the VLM's free-form
text, which eliminates cross-episode phrasing drift.
* When emit_record_row=True (default), the structured record is also
emitted as a row with style='action_record' (added to PERSISTENT_STYLES)
so downstream training can consume the typed schema directly.
* Verb + grasp vocabularies are configurable. Out-of-vocab values are
rejected at extraction time.
2. STRUCTURED 5-AXIS TASK AUGMENTATION
* cfg.task_aug_axes.enabled=True replaces the free-form n_task_rephrasings
path with a structured prompt producing variants along 5 named axes:
synonym_paraphrase (3)
omit_arm (3)
omit_orientation (2)
omit_grasp_method (2)
combined_omissions (2)
Total ~12 variants. Axes with nothing to omit emit fewer entries.
* Each variant is emitted as a task_aug row at t=0 (existing style).
Inspired by https://github.com/GaTech-RL2/EgoVerse/tree/main/egomimic/scripts/language_process
— they pay Scale AI annotators to fill a structured form and then generate
language via a deterministic prompt. We get the same hallucination-reducing
structure via one extra VLM call per subtask.
Files:
src/lerobot/datasets/language.py
src/lerobot/annotations/steerable_pipeline/config.py
src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Emit the ``--dataset.episodes`` include-list for a LeRobot dataset, minus a
|
||||
set of excluded episode indices.
|
||||
|
||||
``LeRobotDatasetConfig.episodes`` is an *include* list (train only on the listed
|
||||
episodes), so "exclude episode X" means "pass every episode except X". This
|
||||
helper builds that complement.
|
||||
|
||||
For ``pepijn223/robocasa_pretrain_human300_v4`` the default exclusion set is the
|
||||
63 episodes that carry NO ``subtask`` annotation (in fact no persistent language
|
||||
rows at all) — see the scan in this PR's discussion. Training the steerable
|
||||
SmolVLA/pi052 policy on those episodes would feed it frames with empty subtask
|
||||
targets, so we drop them.
|
||||
|
||||
Usage (prints a compact ``[0,1,2,...]`` list to stdout, logs to stderr):
|
||||
|
||||
python scripts/build_episode_filter.py \
|
||||
--repo-id pepijn223/robocasa_pretrain_human300_v4
|
||||
|
||||
# capture in a shell script
|
||||
EPISODES=$(python scripts/build_episode_filter.py --repo-id <id>)
|
||||
lerobot-train ... --dataset.episodes="$EPISODES"
|
||||
|
||||
The helper reads ``meta/info.json`` from the Hub to learn ``total_episodes`` and
|
||||
validates that every excluded index is in ``[0, total_episodes)`` before emitting
|
||||
the complement. Pass ``--no-validate-hub`` to skip the network round-trip and use
|
||||
``--total-episodes`` directly (e.g. for an offline / local dataset).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Episodes in pepijn223/robocasa_pretrain_human300_v4 with no `subtask`
# annotation (no persistent language rows at all). 63 episodes / 179,009 frames.
# Used as the exclusion set when --exclude-file is not given. The indices are
# only meaningful for that dataset; pass --exclude-file for any other repo.
DEFAULT_EXCLUDE: tuple[int, ...] = (
    1065, 2972, 6971, 8129, 9167, 9170, 9171, 9177, 9190, 9196, 9199, 9204,
    9207, 9208, 9210, 9217, 9232, 9234, 9240, 9243, 9254, 9256, 9258, 9259,
    9261, 9263, 9264, 15928, 16350, 18729, 20026, 21703, 25314, 25319, 25321,
    25324, 25333, 25340, 25356, 25366, 25374, 25388, 25392, 25825, 25893,
    26347, 26357, 26374, 26375, 26388, 26394, 26398, 26400, 26409, 26422,
    26423, 26426, 26895, 26905, 26915, 26954, 27064, 30812,
)
|
||||
|
||||
|
||||
def _log(msg: str) -> None:
    """Print ``msg`` to stderr and flush immediately.

    Diagnostics are sent to stderr so that stdout can carry only the emitted
    episode list when the script's output is captured by a shell.
    """
    print(msg, file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _total_episodes_from_hub(repo_id: str, revision: str | None) -> int:
    """Return ``total_episodes`` from the dataset's ``meta/info.json`` on the Hub.

    Args:
        repo_id: Dataset repository id on the Hugging Face Hub.
        revision: Revision passed through to ``hf_hub_download``; ``None``
            lets the Hub client pick its default.

    Returns:
        The ``total_episodes`` value as a positive integer.

    Raises:
        KeyError: If ``info.json`` has no ``total_episodes`` entry.
        ValueError: If the reported count is zero or negative.
    """
    # Imported lazily so the module can be loaded (and --no-validate-hub used)
    # without huggingface_hub being importable.
    from huggingface_hub import hf_hub_download

    path = hf_hub_download(
        repo_id=repo_id,
        filename="meta/info.json",
        repo_type="dataset",
        revision=revision,
    )
    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    with open(path, encoding="utf-8") as f:
        info = json.load(f)
    total = int(info["total_episodes"])
    if total <= 0:
        raise ValueError(f"info.json reports non-positive total_episodes={total!r}")
    return total
|
||||
|
||||
|
||||
def build_include_list(total_episodes: int, exclude: set[int]) -> list[int]:
    """Return ``[0, total_episodes)`` with ``exclude`` removed, ascending.

    Args:
        total_episodes: Number of episodes in the dataset; valid indices are
            ``0 .. total_episodes - 1``.
        exclude: Episode indices to drop. Any iterable of ints is accepted;
            duplicates are ignored.

    Returns:
        The ascending list of episode indices that are not excluded.

    Raises:
        ValueError: If ``total_episodes`` is negative, or if any excluded
            index lies outside ``[0, total_episodes)``.
    """
    if total_episodes < 0:
        raise ValueError(f"total_episodes must be non-negative, got {total_episodes!r}")

    # Normalise to a set so membership tests below stay O(1) even when the
    # caller hands in a list or tuple.
    excluded = set(exclude)

    out_of_range = sorted(e for e in excluded if e < 0 or e >= total_episodes)
    if out_of_range:
        # Show at most ten offenders to keep the message readable.
        raise ValueError(
            f"{len(out_of_range)} excluded index(es) outside [0, {total_episodes}): "
            f"{out_of_range[:10]}{'...' if len(out_of_range) > 10 else ''}. "
            "The dataset may have changed — re-run the subtask scan before training."
        )
    return [e for e in range(total_episodes) if e not in excluded]
|
||||
|
||||
|
||||
def _load_exclude_file(path: str) -> set[int]:
    """Load the set of episode indices to exclude from a JSON file.

    The file may contain either a bare JSON list of integers or an object of
    the form ``{"missing_episode_indices": [...]}``.

    Raises:
        ValueError: If the JSON has neither shape, or a listed index is not
            an integer.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Accept either a bare list or the {"missing_episode_indices": [...]} report shape.
    if isinstance(data, dict):
        if "missing_episode_indices" not in data:
            raise ValueError(f"{path}: JSON object has no 'missing_episode_indices' key.")
        data = data["missing_episode_indices"]
    if not isinstance(data, list):
        raise ValueError(f"{path}: expected a JSON list of episode indices, got {type(data).__name__}.")

    # bool is a subclass of int in Python; reject it so `true` is not read as 1.
    invalid = [e for e in data if isinstance(e, bool) or not isinstance(e, int)]
    if invalid:
        raise ValueError(f"{path}: non-integer episode indices: {invalid[:10]!r}")
    return set(data)


def main(argv: list[str] | None = None) -> int:
    """Build the episode include-list and write it to stdout or ``--out``.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: If ``--total-episodes`` disagrees with the Hub's
            ``info.json``, the exclude file is malformed, or an excluded
            index is out of range.
        SystemExit: If ``--no-validate-hub`` is given without
            ``--total-episodes`` (or argparse rejects the arguments).
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--repo-id", default="pepijn223/robocasa_pretrain_human300_v4")
    p.add_argument("--revision", default=None, help="Dataset revision/branch (default: main).")
    p.add_argument(
        "--exclude-file",
        default=None,
        help="Optional JSON file with a list of episode indices to exclude. "
        "Overrides the built-in default set.",
    )
    p.add_argument(
        "--total-episodes",
        type=int,
        default=None,
        help="Total episode count. If omitted, read from meta/info.json on the Hub.",
    )
    p.add_argument(
        "--no-validate-hub",
        action="store_true",
        help="Do not fetch info.json from the Hub; requires --total-episodes.",
    )
    p.add_argument(
        "--out",
        default=None,
        help="Write the list to this file instead of stdout.",
    )
    args = p.parse_args(argv)

    exclude = _load_exclude_file(args.exclude_file) if args.exclude_file else set(DEFAULT_EXCLUDE)

    if args.total_episodes is not None:
        total = args.total_episodes
        if not args.no_validate_hub:
            # Cross-check the user-supplied count against the Hub so a stale
            # number cannot silently shift the include list.
            hub_total = _total_episodes_from_hub(args.repo_id, args.revision)
            if hub_total != total:
                raise ValueError(
                    f"--total-episodes={total} disagrees with Hub info.json total_episodes={hub_total}."
                )
    else:
        if args.no_validate_hub:
            raise SystemExit("--no-validate-hub requires --total-episodes.")
        total = _total_episodes_from_hub(args.repo_id, args.revision)

    include = build_include_list(total, exclude)
    _log(
        f"[build_episode_filter] repo={args.repo_id} total={total} "
        f"excluded={len(exclude)} kept={len(include)}"
    )

    # Compact JSON (no spaces) so the resulting CLI arg stays as short as possible.
    payload = "[" + ",".join(map(str, include)) + "]"
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(payload)
        _log(f"[build_episode_filter] wrote {len(payload)} bytes to {args.out}")
    else:
        # No trailing newline: the value is meant to be captured via $(...).
        sys.stdout.write(payload)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
+115
@@ -0,0 +1,115 @@
|
||||
#!/bin/bash
#SBATCH --job-name=pi052-hirobot-robocasa-human300
#SBATCH --partition=hopper-prod
#SBATCH --qos=high
#SBATCH --time=48:00:00
#SBATCH --ntasks=1
#SBATCH --gpus-per-task=8

# Train pi052 on pepijn223/robocasa_pretrain_human300_v4, skipping the
# episodes that carry no `subtask` annotation.

set -euo pipefail

cd "${LEROBOT_ROOT:-$HOME/lerobot}"

# Resolve the run id first: it names both the compile caches and the output
# directory. Under SLURM it is the job id; otherwise fall back to a timestamp
# so the script also runs outside a SLURM allocation (a bare ${SLURM_JOB_ID}
# would abort under `set -u`).
RUN_ID="${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}"

export LEROBOT_DEBUG_PREDS_EVERY=1000
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
export NCCL_TIMEOUT="${NCCL_TIMEOUT:-1800}"
export HF_HUB_DOWNLOAD_TIMEOUT="${HF_HUB_DOWNLOAD_TIMEOUT:-120}"
export WANDB_INIT_TIMEOUT="${WANDB_INIT_TIMEOUT:-300}"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
# Compile path: pin triton + inductor caches node-local. The shared
# /fsx cache mixes kernels built against different glibc versions and
# trips ``GLIBC_2.34 not found`` on hopper nodes (bench v3 confirmed).
export TRITON_CACHE_DIR="/tmp/triton_${RUN_ID}"
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${RUN_ID}"
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"

# Non-fatal so an unstaged local hotfix doesn't kill the job. CI / clean
# checkouts still fast-forward as before; dirty trees just keep their
# in-flight changes (the working tree is what runs).
git pull --ff-only || echo "[warn] git pull skipped — keeping working tree."
python -m pip install -q --upgrade -e .
python -m pip install -q --upgrade -e '.[pi]'
python -m pip install -q --upgrade 'liger-kernel'

# FlashAttention-2 is NOT installed. The pi052 dual-expert layer compute
# uses SDPA (the block-bidirectional mask is unsupported by FA2 anyway),
# and the only other consumer would be liger-kernel — which gracefully
# degrades when flash_attn is absent. The previously-installed wheel was
# built against a newer GLIBC than some hopper compute nodes provide
# (job 22162586 on ip-26-0-162-14 hit ``GLIBC_2.32 not found``), so the
# safest configuration is "not installed". To re-enable for the
# downstream HF Gemma ``generate`` path, install a wheel matching the
# node's libc — but verify on every assigned node first.

DATASET="pepijn223/robocasa_pretrain_human300_v4"
DATASET_REVISION="${DATASET_REVISION:-main}"
POLICY_REPO_ID="pepijn223/pi052_robocasa_human300"
JOB_NAME="pi052-hirobot-robocasa-human300"
NUM_PROCESSES=8
# BS=36 — fits ~72 GB / 80 GB, BS=36 × 8 GPUs = 288 effective.
BATCH_SIZE=${BATCH_SIZE:-36}
STEPS=${STEPS:-5000}
OUTPUT_DIR="/fsx/pepijn/outputs/train/pi052_robocasa_human300_${RUN_ID}"

# --- Exclude un-annotated episodes -----------------------------------------
# 63 episodes in this dataset carry NO `subtask` annotation (no persistent
# language rows at all). `--dataset.episodes` is an INCLUDE list, so we pass
# the complement: every episode index except those 63. The helper reads
# meta/info.json from the Hub to confirm total_episodes (32043) and validates
# the excluded indices are in range before emitting the list. If the dataset
# version changes such that the indices fall out of range, the helper aborts
# the job rather than silently training on the wrong episodes.
echo "Building episode include-list (excluding un-annotated episodes)..."
EPISODES=$(python scripts/build_episode_filter.py \
    --repo-id "$DATASET" \
    --revision "$DATASET_REVISION")

# Linux caps every single argv string at 32 pages (MAX_ARG_STRLEN: 128 KiB
# with 4 KiB pages). A ~32k-episode include list is larger than that, in which
# case the launch below fails with "Argument list too long". Warn up front so
# the failure is easy to diagnose.
# NOTE(review): confirm whether lerobot can take the list another way (e.g. a
# config file) and switch to that if so.
ARG_LIMIT=$(( $(getconf PAGE_SIZE 2>/dev/null || echo 4096) * 32 ))
# 19 = length of the "--dataset.episodes=" prefix; +1 for the trailing NUL.
EPISODES_ARG_BYTES=$(( ${#EPISODES} + 19 + 1 ))
if (( EPISODES_ARG_BYTES > ARG_LIMIT )); then
    echo "[warn] --dataset.episodes argument is ${EPISODES_ARG_BYTES} bytes, above the" \
         "per-argument limit of ${ARG_LIMIT} bytes; exec may fail with 'Argument list too long'." >&2
fi

echo "Training pi052 on $DATASET with ${NUM_PROCESSES} GPUs, batch size ${BATCH_SIZE}/GPU, ${STEPS} steps"
echo "Output directory: $OUTPUT_DIR"
export LEROBOT_DUMP_RECIPE_SAMPLES=8

accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
    -m lerobot.scripts.lerobot_train \
    --policy.type=pi052 \
    --policy.pretrained_path=lerobot/pi05_base \
    --policy.recipe_path=recipes/subtask_mem_vqa_robocasa.yaml \
    --dataset.repo_id="$DATASET" \
    --dataset.revision="$DATASET_REVISION" \
    --dataset.episodes="$EPISODES" \
    --dataset.video_backend=pyav \
    --output_dir="$OUTPUT_DIR" \
    --job_name="$JOB_NAME" \
    --policy.repo_id="$POLICY_REPO_ID" \
    --policy.compile_model=true \
    --policy.compile_mode=default \
    --policy.gradient_checkpointing=true \
    --policy.device=cuda \
    --policy.tokenizer_max_length=256 \
    --policy.action_tokenizer_name=lerobot/fast-action-tokenizer \
    --policy.chunk_size=30 \
    --policy.n_action_steps=30 \
    --policy.max_action_tokens=256 \
    --steps="$STEPS" \
    --policy.scheduler_decay_steps="$STEPS" \
    --batch_size="$BATCH_SIZE" \
    --wandb.enable=true \
    --policy.dtype=bfloat16 \
    --policy.optimizer_lr=5e-5 \
    --policy.optimizer_grad_clip_norm=1.0 \
    --policy.scheduler_decay_lr=5e-6 \
    --policy.lm_head_lr_scale=5.0 \
    --ema.enable=true \
    --wandb.disable_artifact=true \
    --wandb.project=hirobot \
    --log_freq=100 \
    --save_freq=5000 \
    --num_workers=4 \
    --prefetch_factor=4 \
    --persistent_workers=true \
    --dataset.image_transforms.enable=true \
    --dataset.image_transforms.max_num_transforms=3 \
    --dataset.image_transforms.random_order=true \
    --policy.auto_fit_fast_tokenizer=true \
    --policy.knowledge_insulation=true
|
||||
Reference in New Issue
Block a user