mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-11 22:59:50 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2ab59a3099 | |||
| dab511dbb1 | |||
| fd00e38851 |
@@ -310,3 +310,181 @@ jobs:
|
||||
name: metaworld-metrics
|
||||
path: /tmp/metaworld-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── LIBERO-plus ───────────────────────────────────────────────────────────
|
||||
libero-plus-integration-test:
|
||||
name: LIBERO-plus — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Build LIBERO-plus benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.libero_plus
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-libero-plus:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
|
||||
|
||||
- name: Run LIBERO-plus smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name libero-plus-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
lerobot-benchmark-libero-plus:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_libero_plus \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial \
|
||||
'--env.task_ids=[0,100,260,500,1000,1500,2000,2400]' \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
|
||||
--policy.empty_cameras=1 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env libero_plus --task libero_spatial \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy LIBERO-plus artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/libero-plus-artifacts
|
||||
docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
|
||||
docker rm -f libero-plus-eval || true
|
||||
|
||||
- name: Parse LIBERO-plus eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/libero-plus-artifacts \
|
||||
--env libero_plus \
|
||||
--task libero_spatial \
|
||||
--policy lerobot/smolvla_libero_plus
|
||||
|
||||
- name: Upload LIBERO-plus rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-rollout-video
|
||||
path: /tmp/libero-plus-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload LIBERO-plus eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-metrics
|
||||
path: /tmp/libero-plus-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOMME ───────────────────────────────────────────────────────────────
|
||||
robomme-integration-test:
|
||||
name: RoboMME — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Build RoboMME benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robomme
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robomme:ci
|
||||
|
||||
- name: Run RoboMME smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robomme-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
lerobot-benchmark-robomme:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robomme \
|
||||
--env.type=robomme \
|
||||
--env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
|
||||
--env.dataset_split=test \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
|
||||
--policy.empty_cameras=3 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env robomme --task PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboMME artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robomme-artifacts
|
||||
docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robomme-eval || true
|
||||
|
||||
- name: Parse RoboMME eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robomme-artifacts \
|
||||
--env robomme \
|
||||
--task PickXtimes \
|
||||
--policy lerobot/smolvla_robomme
|
||||
|
||||
- name: Upload RoboMME rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-rollout-video
|
||||
path: /tmp/robomme-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboMME eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-metrics
|
||||
path: /tmp/robomme-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
@@ -0,0 +1,60 @@
|
||||
# LeRobot LIBERO Training Benchmark
|
||||
|
||||
Train and evaluate all LeRobot policies on [LIBERO](https://libero-project.github.io/) and publish results as a HuggingFace leaderboard dataset.
|
||||
|
||||
## Policies
|
||||
|
||||
| Policy | Base Model | GPUs | LR | Chunk | Notes |
|
||||
| -------------- | -------------------- | ---- | ------ | ----- | ------------------------------------- |
|
||||
| pi0 | lerobot/pi0_base | 8 | 2.5e-5 | 30 | PaliGemma + Gemma flow matching |
|
||||
| pi0_fast | lerobot/pi0fast-base | 8 | 2.5e-5 | 30 | Requires tokenizer pre-training |
|
||||
| pi05 | lerobot/pi05_base | 8 | 2.5e-5 | 30 | Quantiles normalization |
|
||||
| groot | nvidia/GR00T-N1.5-3B | 8 | 1e-4 | 30 | bf16, diffusion head + projector only |
|
||||
| act | From scratch | 1 | 1e-5 | 30 | ResNet-18, lightweight |
|
||||
| diffusion | From scratch | 1 | 1e-4 | 32\* | U-Net, horizon must be divisible by 8 |
|
||||
| smolvla | lerobot/smolvla_base | 8 | 1e-4 | 30 | SmolVLM2-500M |
|
||||
| xvla | lerobot/xvla-widowx | 4 | 1e-4 | 32\* | Florence2 + CLIP |
|
||||
| multi_task_dit | From scratch | 1 | 2e-5 | 32\* | CLIP + DiT |
|
||||
|
||||
\* These policies use `horizon` rather than `chunk_size`. Set to 32 (nearest valid value to 30).
|
||||
|
||||
## Training spec
|
||||
|
||||
- **Steps**: 5,000 per policy
|
||||
- **Batch size**: 32 per GPU (effective BS = 256 for multi-GPU)
|
||||
- **Dataset**: `lerobot/libero` (libero_spatial)
|
||||
- **Evaluation**: 20 episodes after training
|
||||
- **LR**: each policy's default optimizer/scheduler preset
|
||||
- **Results**: each SLURM job publishes its own row to the HF leaderboard dataset automatically
|
||||
|
||||
## Quick start
|
||||
|
||||
### 1. Generate SLURM scripts
|
||||
|
||||
```bash
|
||||
python benchmarks/libero/run_benchmark.py \
|
||||
--output_dir /scratch/lerobot-benchmark \
|
||||
--hub_org lerobot
|
||||
```
|
||||
|
||||
### 2. Submit jobs
|
||||
|
||||
```bash
|
||||
# If using pi0_fast, submit tokenizer first:
|
||||
sbatch /scratch/lerobot-benchmark/slurm_scripts/00_tokenizer.sh
|
||||
# Wait, then submit pi0_fast
|
||||
|
||||
# All other policies can run in parallel:
|
||||
for script in /scratch/lerobot-benchmark/slurm_scripts/[0-9][0-9]_*.sh; do
|
||||
[[ "$script" == *pi0_fast* ]] && continue
|
||||
sbatch "$script"
|
||||
done
|
||||
```
|
||||
|
||||
Each job publishes its result to `lerobot/benchmark-libero` on the Hub when it finishes.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- SLURM cluster with CUDA GPUs (A100 80GB recommended for VLM policies)
|
||||
- `pip install lerobot[pi,smolvla,groot,xvla,multi_task_dit,libero] datasets`
|
||||
- `huggingface-cli login`
|
||||
@@ -0,0 +1,606 @@
|
||||
#!/usr/bin/env python
|
||||
"""Generate SLURM sbatch scripts for training all LeRobot policies on LIBERO.
|
||||
|
||||
Each generated script trains one policy, evaluates it, and publishes its
|
||||
results row to a HuggingFace leaderboard dataset — no separate collection
|
||||
step needed.
|
||||
|
||||
Usage:
|
||||
# Generate scripts for all policies:
|
||||
python benchmarks/libero/run_benchmark.py \\
|
||||
--output_dir /scratch/lerobot-benchmark --hub_org lerobot
|
||||
|
||||
# Generate for a subset:
|
||||
python benchmarks/libero/run_benchmark.py \\
|
||||
--policies pi0 smolvla act \\
|
||||
--output_dir /scratch/lerobot-benchmark --hub_org lerobot
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import textwrap
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Policy benchmark configs
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
|
||||
class PolicyBenchmarkConfig:
|
||||
"""Training configuration for a single policy on a benchmark."""
|
||||
|
||||
policy_type: str
|
||||
policy_path: str | None = None
|
||||
num_gpus: int = 1
|
||||
chunk_size: int | None = None # Set on policies that use chunk_size (not horizon)
|
||||
extra_policy_args: dict[str, str] = field(default_factory=dict)
|
||||
needs_tokenizer: bool = False
|
||||
tokenizer_args: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
COMMON_TRAINING_ARGS: dict[str, str] = {
|
||||
"dataset.repo_id": "lerobot/libero",
|
||||
"dataset.use_imagenet_stats": "false",
|
||||
"env.type": "libero",
|
||||
"env.task": "libero_spatial",
|
||||
"steps": "5000",
|
||||
"batch_size": "32",
|
||||
"eval_freq": "0",
|
||||
"save_freq": "5000",
|
||||
"save_checkpoint": "true",
|
||||
"log_freq": "100",
|
||||
"wandb.enable": "true",
|
||||
"policy.push_to_hub": "true",
|
||||
"rename_map": (
|
||||
'{"observation.images.image":"observation.images.camera1",'
|
||||
'"observation.images.image2":"observation.images.camera2"}'
|
||||
),
|
||||
}
|
||||
|
||||
EVAL_ARGS: dict[str, str] = {
|
||||
"env.type": "libero",
|
||||
"env.task": "libero_spatial",
|
||||
"eval.n_episodes": "20",
|
||||
"eval.batch_size": "10",
|
||||
}
|
||||
|
||||
POLICY_CONFIGS: dict[str, PolicyBenchmarkConfig] = {
|
||||
"pi0": PolicyBenchmarkConfig(
|
||||
policy_type="pi0",
|
||||
policy_path="lerobot/pi0_base",
|
||||
num_gpus=8,
|
||||
chunk_size=30,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.scheduler_decay_steps": "5000",
|
||||
},
|
||||
),
|
||||
"pi0_fast": PolicyBenchmarkConfig(
|
||||
policy_type="pi0_fast",
|
||||
policy_path="lerobot/pi0fast-base",
|
||||
num_gpus=8,
|
||||
chunk_size=30,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.scheduler_decay_steps": "5000",
|
||||
},
|
||||
needs_tokenizer=True,
|
||||
tokenizer_args={
|
||||
"repo_id": "lerobot/libero",
|
||||
"action_horizon": "30",
|
||||
"encoded_dims": "0:7",
|
||||
"normalization_mode": "QUANTILES",
|
||||
"vocab_size": "1024",
|
||||
"scale": "10.0",
|
||||
"push_to_hub": "true",
|
||||
},
|
||||
),
|
||||
"pi05": PolicyBenchmarkConfig(
|
||||
policy_type="pi05",
|
||||
policy_path="lerobot/pi05_base",
|
||||
num_gpus=8,
|
||||
chunk_size=30,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.scheduler_decay_steps": "5000",
|
||||
},
|
||||
),
|
||||
"groot": PolicyBenchmarkConfig(
|
||||
policy_type="groot",
|
||||
policy_path=None,
|
||||
num_gpus=8,
|
||||
chunk_size=30,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.base_model_path": "nvidia/GR00T-N1.5-3B",
|
||||
"policy.tune_diffusion_model": "true",
|
||||
"policy.tune_projector": "true",
|
||||
"policy.tune_llm": "false",
|
||||
"policy.tune_visual": "false",
|
||||
"policy.use_bf16": "true",
|
||||
},
|
||||
),
|
||||
"act": PolicyBenchmarkConfig(
|
||||
policy_type="act",
|
||||
policy_path=None,
|
||||
num_gpus=1,
|
||||
chunk_size=30,
|
||||
extra_policy_args={"policy.n_action_steps": "30"},
|
||||
),
|
||||
"diffusion": PolicyBenchmarkConfig(
|
||||
policy_type="diffusion",
|
||||
policy_path=None,
|
||||
num_gpus=1,
|
||||
chunk_size=None,
|
||||
extra_policy_args={
|
||||
"policy.horizon": "32",
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.n_obs_steps": "2",
|
||||
},
|
||||
),
|
||||
"smolvla": PolicyBenchmarkConfig(
|
||||
policy_type="smolvla",
|
||||
policy_path="lerobot/smolvla_base",
|
||||
num_gpus=8,
|
||||
chunk_size=30,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "30",
|
||||
"policy.load_vlm_weights": "true",
|
||||
"policy.freeze_vision_encoder": "false",
|
||||
"policy.train_expert_only": "false",
|
||||
"policy.scheduler_decay_steps": "5000",
|
||||
},
|
||||
),
|
||||
"xvla": PolicyBenchmarkConfig(
|
||||
policy_type="xvla",
|
||||
policy_path="lerobot/xvla-widowx",
|
||||
num_gpus=4,
|
||||
chunk_size=32,
|
||||
extra_policy_args={
|
||||
"policy.n_action_steps": "32",
|
||||
"policy.scheduler_decay_steps": "5000",
|
||||
},
|
||||
),
|
||||
"multi_task_dit": PolicyBenchmarkConfig(
|
||||
policy_type="multi_task_dit",
|
||||
policy_path=None,
|
||||
num_gpus=1,
|
||||
chunk_size=None,
|
||||
extra_policy_args={
|
||||
"policy.horizon": "32",
|
||||
"policy.n_action_steps": "30",
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
ALL_POLICY_NAMES = list(POLICY_CONFIGS.keys())
|
||||
|
||||
# GPU memory estimates (GB) for SLURM --mem allocation
|
||||
GPU_MEM_ESTIMATES: dict[str, int] = {
|
||||
"pi0": 320,
|
||||
"pi0_fast": 320,
|
||||
"pi05": 280,
|
||||
"groot": 320,
|
||||
"act": 64,
|
||||
"diffusion": 64,
|
||||
"smolvla": 160,
|
||||
"xvla": 160,
|
||||
"multi_task_dit": 64,
|
||||
}
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# SLURM script generation
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _cli_args(args: dict[str, str]) -> str:
|
||||
"""Build a backslash-continued CLI arg string with proper shell quoting."""
|
||||
lines = []
|
||||
for key, value in args.items():
|
||||
if any(c in str(value) for c in ["{", "}", " ", '"', "'"]):
|
||||
lines.append(f" --{key}='{value}'")
|
||||
else:
|
||||
lines.append(f" --{key}={value}")
|
||||
return " \\\n".join(lines)
|
||||
|
||||
|
||||
def _training_cli_args(
|
||||
policy_name: str,
|
||||
output_dir: Path,
|
||||
hub_org: str,
|
||||
benchmark_uuid: str,
|
||||
) -> str:
|
||||
cfg = POLICY_CONFIGS[policy_name]
|
||||
args: dict[str, str] = {}
|
||||
args.update(COMMON_TRAINING_ARGS)
|
||||
args["policy.type"] = cfg.policy_type
|
||||
if cfg.policy_path:
|
||||
args["policy.path"] = cfg.policy_path
|
||||
if cfg.chunk_size is not None:
|
||||
args["policy.chunk_size"] = str(cfg.chunk_size)
|
||||
args.update(cfg.extra_policy_args)
|
||||
args["output_dir"] = str(output_dir / "train" / policy_name)
|
||||
args["policy.repo_id"] = f"{hub_org}/{policy_name}_libero"
|
||||
args["wandb.project"] = "lerobot-libero-benchmark"
|
||||
args["wandb.run_name"] = f"{policy_name}_{benchmark_uuid[:8]}"
|
||||
return _cli_args(args)
|
||||
|
||||
|
||||
def _publish_snippet(
|
||||
policy_name: str,
|
||||
output_dir: Path,
|
||||
hub_org: str,
|
||||
benchmark_uuid: str,
|
||||
hub_dataset: str,
|
||||
) -> str:
|
||||
"""Inline Python that each SLURM job runs to publish its own result row."""
|
||||
cfg = POLICY_CONFIGS[policy_name]
|
||||
steps = int(COMMON_TRAINING_ARGS["steps"])
|
||||
bs = int(COMMON_TRAINING_ARGS["batch_size"])
|
||||
eff_bs = bs * cfg.num_gpus
|
||||
train_dir = output_dir / "train" / policy_name
|
||||
|
||||
return textwrap.dedent(f"""\
|
||||
python3 -c "
|
||||
import json, os, re, sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
|
||||
timing = {{}}
|
||||
tp = Path('{output_dir}/logs/{policy_name}_timing.txt')
|
||||
if tp.exists():
|
||||
for ln in tp.read_text().splitlines():
|
||||
if '=' in ln:
|
||||
k, _, v = ln.partition('=')
|
||||
timing[k.strip()] = v.strip()
|
||||
|
||||
# Parse eval results
|
||||
eval_sr, eval_per_task, eval_n = None, '{{}}', 0
|
||||
eval_dir = Path('{train_dir}/eval_results')
|
||||
if eval_dir.exists():
|
||||
for jf in eval_dir.glob('**/*.json'):
|
||||
try:
|
||||
d = json.loads(jf.read_text())
|
||||
except Exception:
|
||||
continue
|
||||
if 'avg_success_rate' in d:
|
||||
eval_sr = d['avg_success_rate']
|
||||
elif 'eval_info' in d and 'avg_success_rate' in d.get('eval_info', {{}}):
|
||||
eval_sr = d['eval_info']['avg_success_rate']
|
||||
pt = {{k: v for k, v in d.items() if 'success_rate' in k and k != 'avg_success_rate'}}
|
||||
if pt:
|
||||
eval_per_task = json.dumps(pt)
|
||||
if 'n_episodes' in d:
|
||||
eval_n = d['n_episodes']
|
||||
|
||||
# Parse final loss from SLURM stdout
|
||||
final_loss = None
|
||||
for lf in sorted(Path('{output_dir}/logs').glob('{policy_name}_*.out'), reverse=True):
|
||||
losses = re.findall(r'\\\"loss\\\"\\s*:\\s*([\\d.e+-]+)', lf.read_text())
|
||||
if losses:
|
||||
final_loss = float(losses[-1])
|
||||
break
|
||||
|
||||
# Parse peak GPU mem
|
||||
peak_mem = 0.0
|
||||
csv_p = Path('{output_dir}/logs/{policy_name}_gpu_mem.csv')
|
||||
if csv_p.exists():
|
||||
for ln in csv_p.read_text().splitlines():
|
||||
parts = ln.strip().split(',')
|
||||
if len(parts) >= 2:
|
||||
try:
|
||||
peak_mem = max(peak_mem, float(parts[1].strip()))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Parse train config for optimizer details
|
||||
lr, opt_wd, sched_type, sched_warmup, sched_decay = 0.0, 0.0, '', 0, 0
|
||||
freeze_ve, train_eo, grad_ckpt = False, False, False
|
||||
cfg_path = Path('{train_dir}/checkpoints/{steps:06d}/pretrained_model/train_config.json')
|
||||
if cfg_path.exists():
|
||||
tc = json.loads(cfg_path.read_text())
|
||||
o = tc.get('optimizer', {{}})
|
||||
lr = o.get('lr', 0.0)
|
||||
opt_wd = o.get('weight_decay', 0.0)
|
||||
s = tc.get('scheduler', {{}})
|
||||
sched_type = s.get('type', '')
|
||||
sched_warmup = s.get('num_warmup_steps', 0)
|
||||
sched_decay = s.get('num_decay_steps', 0)
|
||||
p = tc.get('policy', {{}})
|
||||
freeze_ve = p.get('freeze_vision_encoder', False)
|
||||
train_eo = p.get('train_expert_only', False)
|
||||
grad_ckpt = p.get('gradient_checkpointing', False)
|
||||
|
||||
row = {{
|
||||
'benchmark_uuid': '{benchmark_uuid}',
|
||||
'policy_type': '{policy_name}',
|
||||
'policy_repo_id': '{hub_org}/{policy_name}_libero',
|
||||
'base_model_repo_id': '{cfg.policy_path or ""}',
|
||||
'dataset_repo_id': '{COMMON_TRAINING_ARGS["dataset.repo_id"]}',
|
||||
'env_type': '{COMMON_TRAINING_ARGS["env.type"]}',
|
||||
'env_task': '{COMMON_TRAINING_ARGS["env.task"]}',
|
||||
'steps': {steps},
|
||||
'batch_size_per_gpu': {bs},
|
||||
'num_gpus': {cfg.num_gpus},
|
||||
'effective_batch_size': {eff_bs},
|
||||
'total_samples_seen': {steps * eff_bs},
|
||||
'chunk_size': {cfg.chunk_size or 0},
|
||||
'learning_rate': lr,
|
||||
'optimizer_type': 'AdamW',
|
||||
'optimizer_weight_decay': opt_wd,
|
||||
'scheduler_type': sched_type,
|
||||
'scheduler_warmup_steps': sched_warmup,
|
||||
'scheduler_decay_steps': sched_decay,
|
||||
'freeze_vision_encoder': freeze_ve,
|
||||
'train_expert_only': train_eo,
|
||||
'gradient_checkpointing': grad_ckpt,
|
||||
'eval_success_rate': eval_sr,
|
||||
'eval_success_rate_per_task': eval_per_task,
|
||||
'eval_n_episodes': eval_n,
|
||||
'final_train_loss': final_loss,
|
||||
'training_time_s': float(timing.get('TRAINING_TIME_S', 0)),
|
||||
'peak_gpu_memory_mb': peak_mem or float(timing.get('MAX_GPU_MEM_MB', 0)),
|
||||
'gpu_type': timing.get('GPU_TYPE', 'unknown'),
|
||||
'lerobot_commit': timing.get('LEROBOT_COMMIT', 'unknown'),
|
||||
'timestamp': datetime.now(timezone.utc).isoformat(),
|
||||
}}
|
||||
|
||||
# Save locally
|
||||
Path('{train_dir}/benchmark_result.json').write_text(json.dumps(row, indent=2, default=str))
|
||||
|
||||
# Push to HF dataset
|
||||
try:
|
||||
from datasets import Dataset, load_dataset
|
||||
try:
|
||||
existing = load_dataset('{hub_dataset}', split='train')
|
||||
rows = existing.to_list() + [row]
|
||||
except Exception:
|
||||
rows = [row]
|
||||
Dataset.from_list(rows).push_to_hub('{hub_dataset}', split='train')
|
||||
print('Published result to {hub_dataset}')
|
||||
except ImportError:
|
||||
print('datasets library not installed — result saved locally only')
|
||||
except Exception as e:
|
||||
print(f'Failed to push to hub: {{e}} — result saved locally')
|
||||
"
|
||||
""")
|
||||
|
||||
|
||||
def _generate_sbatch_script(
|
||||
policy_name: str,
|
||||
output_dir: Path,
|
||||
hub_org: str,
|
||||
benchmark_uuid: str,
|
||||
hub_dataset: str,
|
||||
lerobot_commit: str,
|
||||
) -> str:
|
||||
cfg = POLICY_CONFIGS[policy_name]
|
||||
steps = int(COMMON_TRAINING_ARGS["steps"])
|
||||
log_dir = output_dir / "logs"
|
||||
train_dir = output_dir / "train" / policy_name
|
||||
checkpoint_path = train_dir / f"checkpoints/{steps:06d}/pretrained_model"
|
||||
|
||||
training_args = _training_cli_args(policy_name, output_dir, hub_org, benchmark_uuid)
|
||||
eval_args = _cli_args(EVAL_ARGS)
|
||||
publish = _publish_snippet(policy_name, output_dir, hub_org, benchmark_uuid, hub_dataset)
|
||||
|
||||
return textwrap.dedent(f"""\
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench_{policy_name}
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --gres=gpu:{cfg.num_gpus}
|
||||
#SBATCH --cpus-per-task={cfg.num_gpus * 8}
|
||||
#SBATCH --mem={GPU_MEM_ESTIMATES.get(policy_name, 128)}G
|
||||
#SBATCH --time=06:00:00
|
||||
#SBATCH --output={log_dir}/{policy_name}_%j.out
|
||||
#SBATCH --error={log_dir}/{policy_name}_%j.err
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
echo "=========================================="
|
||||
echo "LeRobot LIBERO Benchmark — {policy_name}"
|
||||
echo "UUID: {benchmark_uuid}"
|
||||
echo "Start: $(date -Iseconds)"
|
||||
echo "Host: $(hostname) | GPUs: {cfg.num_gpus}"
|
||||
echo "=========================================="
|
||||
|
||||
START_TIME=$(date +%s)
|
||||
|
||||
# GPU memory monitoring (every 30s)
|
||||
nvidia-smi --query-gpu=index,memory.used,memory.total,gpu_name \\
|
||||
--format=csv,noheader,nounits -l 30 \\
|
||||
> "{log_dir}/{policy_name}_gpu_mem.csv" &
|
||||
GPU_MONITOR_PID=$!
|
||||
|
||||
# ── Training ──────────────────────────────────────────────────
|
||||
echo "[$(date -Iseconds)] Starting training..."
|
||||
accelerate launch --num_processes={cfg.num_gpus} \\
|
||||
$(which lerobot-train) \\
|
||||
{training_args}
|
||||
TRAIN_EXIT=$?
|
||||
TRAIN_END=$(date +%s)
|
||||
echo "[$(date -Iseconds)] Training exit code: $TRAIN_EXIT"
|
||||
|
||||
# ── Evaluation ────────────────────────────────────────────────
|
||||
EVAL_EXIT=1
|
||||
if [ $TRAIN_EXIT -eq 0 ]; then
|
||||
echo "[$(date -Iseconds)] Starting evaluation..."
|
||||
lerobot-eval \\
|
||||
--policy.path="{checkpoint_path}" \\
|
||||
{eval_args} \\
|
||||
--output_dir="{train_dir}/eval_results"
|
||||
EVAL_EXIT=$?
|
||||
echo "[$(date -Iseconds)] Eval exit code: $EVAL_EXIT"
|
||||
else
|
||||
echo "[$(date -Iseconds)] Skipping eval — training failed."
|
||||
fi
|
||||
|
||||
# ── Timing ────────────────────────────────────────────────────
|
||||
END_TIME=$(date +%s)
|
||||
kill $GPU_MONITOR_PID 2>/dev/null || true
|
||||
|
||||
cat > "{log_dir}/{policy_name}_timing.txt" <<TIMING_EOF
|
||||
BENCHMARK_UUID={benchmark_uuid}
|
||||
POLICY_TYPE={policy_name}
|
||||
TRAINING_TIME_S=$((TRAIN_END - START_TIME))
|
||||
TOTAL_TIME_S=$((END_TIME - START_TIME))
|
||||
TRAIN_EXIT=$TRAIN_EXIT
|
||||
EVAL_EXIT=$EVAL_EXIT
|
||||
MAX_GPU_MEM_MB=$(awk -F',' '{{print $2}}' "{log_dir}/{policy_name}_gpu_mem.csv" 2>/dev/null | sort -n | tail -1)
|
||||
GPU_TYPE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | head -1 | xargs)
|
||||
LEROBOT_COMMIT={lerobot_commit}
|
||||
TIMING_EOF
|
||||
|
||||
# ── Publish result to HF dataset ──────────────────────────────
|
||||
echo "[$(date -Iseconds)] Publishing result..."
|
||||
{publish}
|
||||
|
||||
echo "=========================================="
|
||||
echo "Done: $(date -Iseconds)"
|
||||
echo "Training: $((TRAIN_END - START_TIME))s | Total: $((END_TIME - START_TIME))s"
|
||||
echo "=========================================="
|
||||
""")
|
||||
|
||||
|
||||
def _generate_tokenizer_script(
|
||||
output_dir: Path,
|
||||
hub_org: str,
|
||||
benchmark_uuid: str,
|
||||
) -> str:
|
||||
cfg = POLICY_CONFIGS["pi0_fast"]
|
||||
log_dir = output_dir / "logs"
|
||||
tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
|
||||
|
||||
tok_args = dict(cfg.tokenizer_args)
|
||||
tok_args["hub_repo_id"] = tokenizer_hub_repo
|
||||
|
||||
return textwrap.dedent(f"""\
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench_tokenizer
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --gres=gpu:1
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --mem=64G
|
||||
#SBATCH --time=01:00:00
|
||||
#SBATCH --output={log_dir}/tokenizer_%j.out
|
||||
#SBATCH --error={log_dir}/tokenizer_%j.err
|
||||
|
||||
set -euo pipefail
|
||||
echo "LeRobot — FAST Tokenizer | UUID: {benchmark_uuid}"
|
||||
|
||||
lerobot-train-tokenizer \\
|
||||
{_cli_args(tok_args)}
|
||||
|
||||
echo "Tokenizer pushed to: {tokenizer_hub_repo}"
|
||||
""")
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Generate SLURM scripts for LeRobot LIBERO benchmark.")
|
||||
parser.add_argument(
|
||||
"--policies",
|
||||
nargs="+",
|
||||
default=ALL_POLICY_NAMES,
|
||||
choices=ALL_POLICY_NAMES,
|
||||
help="Policies to benchmark (default: all).",
|
||||
)
|
||||
parser.add_argument("--output_dir", type=Path, required=True, help="Root output directory.")
|
||||
parser.add_argument("--hub_org", type=str, default="lerobot", help="HuggingFace org.")
|
||||
parser.add_argument("--hub_dataset", type=str, default=None, help="HF dataset repo for results.")
|
||||
parser.add_argument("--uuid", type=str, default=None, help="Override benchmark UUID.")
|
||||
args = parser.parse_args()
|
||||
|
||||
benchmark_uuid = args.uuid or str(uuid.uuid4())
|
||||
output_dir: Path = args.output_dir.resolve()
|
||||
policies: list[str] = args.policies
|
||||
hub_org: str = args.hub_org
|
||||
hub_dataset: str = args.hub_dataset or f"{hub_org}/benchmark-libero"
|
||||
|
||||
try:
|
||||
commit = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
commit = "unknown"
|
||||
|
||||
scripts_dir = output_dir / "slurm_scripts"
|
||||
log_dir = output_dir / "logs"
|
||||
scripts_dir.mkdir(parents=True, exist_ok=True)
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
for p in policies:
|
||||
(output_dir / "train" / p).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
generated: dict[str, Path] = {}
|
||||
|
||||
# Tokenizer job for pi0_fast
|
||||
tokenizer_path = None
|
||||
if "pi0_fast" in policies:
|
||||
script = _generate_tokenizer_script(output_dir, hub_org, benchmark_uuid)
|
||||
tokenizer_path = scripts_dir / "00_tokenizer.sh"
|
||||
tokenizer_path.write_text(script)
|
||||
tokenizer_path.chmod(0o755)
|
||||
generated["tokenizer"] = tokenizer_path
|
||||
tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
|
||||
POLICY_CONFIGS["pi0_fast"].extra_policy_args["policy.action_tokenizer_name"] = tokenizer_hub_repo
|
||||
|
||||
# Per-policy scripts
|
||||
for i, name in enumerate(sorted(policies), start=1):
|
||||
script = _generate_sbatch_script(name, output_dir, hub_org, benchmark_uuid, hub_dataset, commit)
|
||||
path = scripts_dir / f"{i:02d}_{name}.sh"
|
||||
path.write_text(script)
|
||||
path.chmod(0o755)
|
||||
generated[name] = path
|
||||
|
||||
# Manifest
|
||||
manifest = {
|
||||
"benchmark_uuid": benchmark_uuid,
|
||||
"timestamp": datetime.now(UTC).isoformat(),
|
||||
"lerobot_commit": commit,
|
||||
"hub_org": hub_org,
|
||||
"hub_dataset": hub_dataset,
|
||||
"policies": policies,
|
||||
"output_dir": str(output_dir),
|
||||
"scripts": {k: str(v) for k, v in generated.items()},
|
||||
}
|
||||
manifest_path = output_dir / "benchmark_manifest.json"
|
||||
manifest_path.write_text(json.dumps(manifest, indent=2))
|
||||
|
||||
# Instructions
|
||||
print("=" * 60)
|
||||
print("LeRobot LIBERO Benchmark — Scripts Generated")
|
||||
print(f"UUID: {benchmark_uuid}")
|
||||
print(f"Output: {output_dir}")
|
||||
print(f"Results dataset: {hub_dataset}")
|
||||
print("=" * 60)
|
||||
print()
|
||||
for _name, path in sorted(generated.items()):
|
||||
print(f" {path}")
|
||||
print()
|
||||
|
||||
if tokenizer_path:
|
||||
print("IMPORTANT: pi0_fast requires tokenizer training FIRST.")
|
||||
print(f" 1. sbatch {tokenizer_path}")
|
||||
print(" 2. Wait for completion")
|
||||
print(f" 3. sbatch {generated.get('pi0_fast', 'N/A')}")
|
||||
print(" 4. All other policies can run in parallel")
|
||||
else:
|
||||
print("All scripts can be submitted in parallel.")
|
||||
print()
|
||||
print("Each job publishes its result to the HF dataset automatically.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
|
||||
|
||||
|
||||
def load_json_if_exists(path: Path | None) -> dict[str, Any] | None:
    """Parse ``path`` as JSON, returning None when the path is None or missing.

    Accepting ``None`` guards the caller that feeds in the (possibly absent)
    result of ``find_latest_train_config_path`` directly; previously that
    raised ``AttributeError`` on ``None.exists()``.
    """
    if path is None or not path.exists():
        return None
    return json.loads(path.read_text())
|
||||
|
||||
|
||||
def find_latest_train_config_path(run_root: Path) -> Path | None:
    """Locate the train_config.json of the newest checkpoint under ``run_root``.

    Checkpoints live at ``<run_root>/train/checkpoints/<step>/pretrained_model``;
    "newest" is the lexicographically greatest ``<step>`` directory name
    (zero-padded step counts sort correctly as strings).
    """
    checkpoints_dir = run_root / "train" / "checkpoints"
    if not checkpoints_dir.exists():
        return None
    best: Path | None = None
    best_step = ""
    for candidate in checkpoints_dir.glob("*/pretrained_model/train_config.json"):
        step_name = candidate.parts[-3]
        if best is None or step_name > best_step:
            best, best_step = candidate, step_name
    return best
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the CLI options describing one finished benchmark run."""
    parser = argparse.ArgumentParser(description=__doc__)
    # Identity of the run.
    for flag in ("--benchmark", "--policy"):
        parser.add_argument(flag, required=True)
    parser.add_argument("--run_root", required=True, type=Path)
    for flag in ("--results_repo", "--git_commit"):
        parser.add_argument(flag, required=True)
    # Resource accounting (all integers).
    for flag in (
        "--num_gpus",
        "--microbatch_per_gpu",
        "--gradient_accumulation_steps",
        "--effective_batch_size",
    ):
        parser.add_argument(flag, required=True, type=int)
    # Wall-clock timings in seconds (floats).
    for flag in ("--train_wall_time_s", "--eval_wall_time_s"):
        parser.add_argument(flag, required=True, type=float)
    parser.add_argument("--slurm_job_id", default="")
    parser.add_argument("--docker_image", required=True)
    return parser.parse_args()
|
||||
|
||||
|
||||
def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
    """Assemble the benchmark result row and the list of files to upload.

    Reads ``eval_info.json`` and the newest checkpoint's ``train_config.json``
    (when present) from ``args.run_root``, writes the row locally to
    ``<run_root>/benchmark_row.json``, and returns ``(row, upload_list)``.
    """
    now = datetime.now(UTC)
    created_at = now.isoformat()
    timestamp = utc_timestamp_slug(now)
    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
    eval_info = load_json_if_exists(args.run_root / "eval" / "eval_info.json") or {}
    train_config_path = find_latest_train_config_path(args.run_root)
    # BUG FIX: train_config_path is None when no checkpoint was written; passing
    # None straight to load_json_if_exists raised AttributeError.
    train_config: dict[str, Any] = {}
    if train_config_path is not None:
        train_config = load_json_if_exists(train_config_path) or {}

    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"

    row = {
        "schema_version": 1,
        "created_at": created_at,
        "run_id": run_id,
        "benchmark": args.benchmark,
        "policy": args.policy,
        "git_commit": args.git_commit,
        "slurm_job_id": args.slurm_job_id or None,
        "docker_image": args.docker_image,
        "resources": {
            "num_gpus": args.num_gpus,
            "microbatch_per_gpu": args.microbatch_per_gpu,
            "gradient_accumulation_steps": args.gradient_accumulation_steps,
            "effective_batch_size": args.effective_batch_size,
        },
        "timings": {
            "train_wall_time_s": args.train_wall_time_s,
            "eval_wall_time_s": args.eval_wall_time_s,
            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
        },
        "eval": {
            "overall": eval_info.get("overall", {}),
            "per_group": eval_info.get("per_group", {}),
            # Only the count of per-task entries is kept; the full list lives in
            # the uploaded eval_info.json artifact.
            "per_task_count": len(eval_info.get("per_task", [])),
        },
        "paths": {
            "run_root": str(args.run_root),
            "train_dir": str(args.run_root / "train"),
            "eval_dir": str(args.run_root / "eval"),
        },
        "train_config": train_config,
        "artifact_urls": {
            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
        },
    }

    row_path = args.run_root / "benchmark_row.json"
    row_path.parent.mkdir(parents=True, exist_ok=True)
    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]

    # Attach optional artifacts only when they exist on disk.
    eval_info_path = args.run_root / "eval" / "eval_info.json"
    if eval_info_path.exists():
        row["artifact_urls"]["eval_info"] = make_hub_file_url(
            args.results_repo, f"{artifact_prefix}/eval_info.json"
        )
        upload_list.append(
            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
        )

    if train_config_path is not None and train_config_path.exists():
        row["artifact_urls"]["train_config"] = make_hub_file_url(
            args.results_repo, f"{artifact_prefix}/train_config.json"
        )
        upload_list.append(
            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
        )

    # Write after artifact_urls is final so the local copy matches the upload.
    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
    return row, upload_list
|
||||
|
||||
|
||||
def main() -> int:
    """Build the benchmark row, upload it plus artifacts, and echo it to stdout.

    Returns 0 on success; upload errors propagate from ``upload_targets``.
    """
    args = parse_args()
    row, upload_list = build_row(args)
    uploaded = upload_targets(
        repo_id=args.results_repo,
        targets=upload_list,
        repo_type="dataset",
        private=False,
        commit_message=f"Add benchmark row {row['run_id']}",
    )
    # Record what actually landed on the Hub, then rewrite the local row so the
    # on-disk copy reflects the uploaded paths.
    row["uploaded_paths"] = uploaded
    row_path = args.run_root / "benchmark_row.json"
    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
    print(json.dumps(row, indent=2, sort_keys=True))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,647 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import subprocess
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lerobot.utils.history_repo import utc_timestamp_slug
|
||||
|
||||
# Resource and schedule defaults applied when a spec does not override them.
MAX_GPUS = 8  # hard ceiling on GPUs a single job may request
MIN_GPUS = 1  # floor so every job gets at least one GPU
DEFAULT_STEPS = 20_000  # training steps per benchmark run
DEFAULT_EFFECTIVE_BATCH_SIZE = 256  # global batch size reached via gradient accumulation
DEFAULT_MICROBATCH_PER_GPU = 32  # per-GPU batch size per micro-step
DEFAULT_EVAL_BATCH_SIZE = 1  # evaluation rolls out one episode at a time
DEFAULT_CPUS_PER_GPU = 8  # SLURM cpus-per-task scaling factor
DEFAULT_MEMORY_PER_GPU_GB = 40  # SLURM memory request scaling factor (GB)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BenchmarkSpec:
    """Static description of one benchmark: dataset, image, and eval settings."""

    name: str  # benchmark identifier (also the BENCHMARKS key)
    dataset_repo_id: str  # HF dataset used for training
    docker_image: str  # benchmark-specific docker image tag
    eval_env_type: str  # value passed as --env.type to lerobot-eval
    eval_task: str  # comma-separated task list passed as --env.task
    eval_n_episodes: int  # episodes evaluated per task
    train_steps: int = DEFAULT_STEPS  # optimizer steps for training
    effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE  # global batch target
    train_extra_args: dict[str, Any] = field(default_factory=dict)  # extra lerobot-train flags
    eval_extra_args: dict[str, Any] = field(default_factory=dict)  # extra lerobot-eval flags
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PolicySpec:
    """Static description of one policy: type, resources, and extra CLI flags."""

    name: str  # policy identifier (also the POLICIES key)
    policy_type: str  # value for --policy.type when no pretrained path is given
    num_gpus: int  # GPUs requested (clamped to [MIN_GPUS, MAX_GPUS] at planning time)
    policy_path: str | None = None  # pretrained checkpoint on the Hub, if any
    microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU  # per-GPU batch size
    extra_train_args: dict[str, Any] = field(default_factory=dict)  # policy-specific train flags
    extra_eval_args: dict[str, Any] = field(default_factory=dict)  # policy-specific eval flags
    needs_tokenizer: bool = False  # True when an action tokenizer must be trained first
    tokenizer_args: dict[str, Any] = field(default_factory=dict)  # flags for lerobot-train-tokenizer
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PlannedJob:
    """Fully-resolved benchmark x policy job, ready to render as an sbatch script."""

    benchmark: str  # BenchmarkSpec name
    policy: str  # PolicySpec name
    run_rel: str  # run directory relative to the benchmark output root
    num_gpus: int  # GPUs actually allocated (after clamping)
    microbatch_per_gpu: int
    gradient_accumulation_steps: int
    effective_batch_size: int
    docker_image: str
    train_args: dict[str, Any]  # final lerobot-train CLI mapping
    eval_args: dict[str, Any]  # final lerobot-eval CLI mapping
    tokenizer_args: dict[str, Any] | None  # tokenizer CLI mapping, or None when unused
    script_path: str  # where the rendered sbatch script will be written
|
||||
|
||||
|
||||
# Registry of benchmarks. Camera rename maps translate the dataset's camera
# keys into the generic camera1/camera2 names the policies are trained with.
BENCHMARKS: dict[str, BenchmarkSpec] = {
    "libero_plus": BenchmarkSpec(
        name="libero_plus",
        dataset_repo_id="lerobot/libero_plus",
        docker_image="lerobot-benchmark-libero-plus:latest",
        eval_env_type="libero_plus",
        eval_task="libero_spatial,libero_object,libero_goal,libero_10",
        eval_n_episodes=10,
        train_extra_args={
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.image2": "observation.images.camera2",
            },
        },
        eval_extra_args={
            # Sim camera names -> the generic names used during training.
            "env.camera_name_mapping": {
                "agentview_image": "camera1",
                "robot0_eye_in_hand_image": "camera2",
            },
            "env.max_parallel_tasks": 1,
            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
            "eval.use_async_envs": False,
            "eval.max_episodes_rendered": 0,
            "policy.device": "cuda",
        },
    ),
    "robomme": BenchmarkSpec(
        name="robomme",
        dataset_repo_id="lerobot/robomme",
        docker_image="lerobot-benchmark-robomme:latest",
        eval_env_type="robomme",
        eval_task=(
            "BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
            "ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
            "VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
        ),
        eval_n_episodes=50,
        train_extra_args={
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.wrist_image": "observation.images.camera2",
            },
        },
        eval_extra_args={
            "env.dataset_split": "test",
            "env.max_parallel_tasks": 1,
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.wrist_image": "observation.images.camera2",
            },
            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
            "eval.use_async_envs": False,
            "eval.max_episodes_rendered": 0,
            "policy.device": "cuda",
        },
    ),
}
|
||||
|
||||
|
||||
# Registry of policies under benchmark. Large VLA policies get 8 GPUs; the
# small from-scratch baselines (act, diffusion, multi_task_dit) fit on one.
POLICIES: dict[str, PolicySpec] = {
    "pi0": PolicySpec(
        name="pi0",
        policy_type="pi0",
        policy_path="lerobot/pi0_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
    ),
    "pi0_fast": PolicySpec(
        name="pi0_fast",
        policy_type="pi0_fast",
        policy_path="lerobot/pi0fast-base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
        # pi0_fast discretizes actions, so an action tokenizer must be trained
        # and pushed before the main training job can start.
        needs_tokenizer=True,
        tokenizer_args={
            "action_horizon": 30,
            "encoded_dims": "0:7",
            "normalization_mode": "QUANTILES",
            "vocab_size": 1024,
            "scale": 10.0,
            "push_to_hub": True,
        },
    ),
    "pi05": PolicySpec(
        name="pi05",
        policy_type="pi05",
        policy_path="lerobot/pi05_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
    ),
    "groot": PolicySpec(
        name="groot",
        policy_type="groot",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
            # Fine-tune the diffusion head and projector only; keep the LLM and
            # vision tower frozen.
            "policy.tune_diffusion_model": True,
            "policy.tune_projector": True,
            "policy.tune_llm": False,
            "policy.tune_visual": False,
            "policy.use_bf16": True,
        },
    ),
    "act": PolicySpec(
        name="act",
        policy_type="act",
        num_gpus=1,
        extra_train_args={
            "policy.n_action_steps": 30,
        },
    ),
    "diffusion": PolicySpec(
        name="diffusion",
        policy_type="diffusion",
        num_gpus=1,
        extra_train_args={
            "policy.horizon": 32,
            "policy.n_action_steps": 30,
            "policy.n_obs_steps": 2,
        },
    ),
    "smolvla": PolicySpec(
        name="smolvla",
        policy_type="smolvla",
        policy_path="lerobot/smolvla_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.load_vlm_weights": True,
            "policy.freeze_vision_encoder": False,
            "policy.train_expert_only": False,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 1,
        },
    ),
    "xvla": PolicySpec(
        name="xvla",
        policy_type="xvla",
        policy_path="lerobot/xvla-widowx",
        num_gpus=4,
        extra_train_args={
            "policy.n_action_steps": 32,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 1,
        },
    ),
    "multi_task_dit": PolicySpec(
        name="multi_task_dit",
        policy_type="multi_task_dit",
        num_gpus=1,
        extra_train_args={
            "policy.horizon": 32,
            "policy.n_action_steps": 30,
        },
    ),
}
|
||||
|
||||
|
||||
def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
    """Qualify a bare repo name with ``hub_org``; pass fully-qualified ids through."""
    if "/" in repo_or_id:
        return repo_or_id
    return f"{hub_org}/{repo_or_id}"
|
||||
|
||||
|
||||
def get_requested_names(
    requested: list[str] | None,
    available: dict[str, Any],
    *,
    kind: str,
) -> list[str]:
    """Validate a user-requested subset of ``available`` keys.

    Falsy ``requested`` (None or empty) selects everything; unknown names raise
    ValueError listing both the unknowns and the valid choices.
    """
    if not requested:
        return list(available)
    missing = [name for name in sorted(set(requested)) if name not in available]
    if missing:
        raise ValueError(f"Unknown {kind}: {', '.join(missing)}. Available: {', '.join(available)}")
    return requested
|
||||
|
||||
|
||||
def compute_gradient_accumulation_steps(
    *,
    effective_batch_size: int,
    num_gpus: int,
    microbatch_per_gpu: int,
) -> int:
    """Return the accumulation steps needed to reach the effective batch size.

    Raises:
        ValueError: if any argument is non-positive (previously a zero GPU
            count raised an opaque ZeroDivisionError and negative values
            produced nonsense), or if the effective batch size is not an exact
            multiple of ``num_gpus * microbatch_per_gpu``.
    """
    if effective_batch_size <= 0 or num_gpus <= 0 or microbatch_per_gpu <= 0:
        raise ValueError(
            f"All arguments must be positive, got {effective_batch_size=}, "
            f"{num_gpus=}, {microbatch_per_gpu=}."
        )
    per_step_batch = num_gpus * microbatch_per_gpu
    if effective_batch_size % per_step_batch != 0:
        raise ValueError(
            f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
            f"{microbatch_per_gpu=}."
        )
    return effective_batch_size // per_step_batch
|
||||
|
||||
|
||||
def make_run_slug() -> str:
    """Return a UTC timestamp slug used to uniquify run directory names."""
    return utc_timestamp_slug()
|
||||
|
||||
|
||||
def shell_value(value: Any) -> str:
    """Render ``value`` as a double-quoted bash literal for generated scripts.

    Booleans become lowercase ``true``/``false``; dicts and lists are
    serialized as sorted-key JSON; everything else is stringified.
    """
    if isinstance(value, bool):
        text = "true" if value else "false"
    elif isinstance(value, (dict, list)):
        text = json.dumps(value, sort_keys=True)
    else:
        text = str(value)
    # Escape exactly the characters bash treats specially inside double quotes.
    # Backslash must go first so later escapes are not doubled.
    for special in ("\\", '"', "$", "`"):
        text = text.replace(special, "\\" + special)
    return f'"{text}"'
|
||||
|
||||
|
||||
def format_cli_args(args: dict[str, Any]) -> str:
    """Render a mapping as indented, backslash-continued ``--key=value`` lines."""
    return " \\\n".join(
        f"    --{key}={shell_value(value)}" for key, value in args.items()
    )
|
||||
|
||||
|
||||
def build_train_args(
    *,
    benchmark: BenchmarkSpec,
    policy: PolicySpec,
    train_dir: str,
    gradient_accumulation_steps: int,
) -> dict[str, Any]:
    """Compose the lerobot-train CLI mapping for one benchmark/policy pair.

    Benchmark extras override the base keys, and policy extras override both.
    """
    # A pretrained checkpoint wins over a bare policy type.
    policy_source = (
        {"policy.path": policy.policy_path}
        if policy.policy_path
        else {"policy.type": policy.policy_type}
    )
    return {
        "dataset.repo_id": benchmark.dataset_repo_id,
        "output_dir": train_dir,
        "steps": benchmark.train_steps,
        "batch_size": policy.microbatch_per_gpu,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "eval_freq": 0,
        "save_freq": benchmark.train_steps,
        "save_checkpoint": True,
        "log_freq": 100,
        "wandb.enable": False,
        "policy.push_to_hub": False,
        "policy.device": "cuda",
        **policy_source,
        **benchmark.train_extra_args,
        **policy.extra_train_args,
    }
|
||||
|
||||
|
||||
def build_eval_args(
    *,
    benchmark: BenchmarkSpec,
    policy: PolicySpec,
    checkpoint_path: str,
    eval_dir: str,
) -> dict[str, Any]:
    """Compose the lerobot-eval CLI mapping for one benchmark/policy pair.

    Benchmark extras override the base keys, and policy extras override both.
    """
    return {
        "policy.path": checkpoint_path,
        "env.type": benchmark.eval_env_type,
        "env.task": benchmark.eval_task,
        "eval.n_episodes": benchmark.eval_n_episodes,
        "output_dir": eval_dir,
        **benchmark.eval_extra_args,
        **policy.extra_eval_args,
    }
|
||||
|
||||
|
||||
def plan_jobs(
    *,
    output_dir: Path,
    hub_org: str,
    results_repo: str,
    policies: list[str],
    benchmarks: list[str],
) -> list[PlannedJob]:
    """Plan one job per (benchmark, policy) pair, fully resolving CLI args.

    ``results_repo`` is accepted for signature symmetry but unused here; it is
    consumed later by ``render_sbatch_script``.
    """
    # FIX: removed the stale "_ = hub_org" unused-parameter suppressor — the
    # parameter IS used below to name the tokenizer repo.
    _ = results_repo
    scripts_dir = output_dir / "slurm"
    jobs: list[PlannedJob] = []
    for benchmark_name in benchmarks:
        benchmark = BENCHMARKS[benchmark_name]
        for policy_name in policies:
            policy = POLICIES[policy_name]
            # Clamp the requested GPU count to the allowed range.
            num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
            run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
            # Paths inside the container; the host dir is bind-mounted there.
            run_root = f"/benchmark-output/{run_rel}"
            gradient_accumulation_steps = compute_gradient_accumulation_steps(
                effective_batch_size=benchmark.effective_batch_size,
                num_gpus=num_gpus,
                microbatch_per_gpu=policy.microbatch_per_gpu,
            )
            train_dir = f"{run_root}/train"
            checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
            eval_dir = f"{run_root}/eval"
            train_args = build_train_args(
                benchmark=benchmark,
                policy=policy,
                train_dir=train_dir,
                gradient_accumulation_steps=gradient_accumulation_steps,
            )
            eval_args = build_eval_args(
                benchmark=benchmark,
                policy=policy,
                checkpoint_path=checkpoint_path,
                eval_dir=eval_dir,
            )
            tokenizer_args = None
            if policy.needs_tokenizer:
                # Tokenizer is trained per benchmark and pushed to the Hub so
                # the training job can reference it by repo id.
                tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
                tokenizer_args = {
                    "repo_id": benchmark.dataset_repo_id,
                    "output_dir": f"{run_root}/tokenizer",
                    "hub_repo_id": tokenizer_repo_id,
                    **policy.tokenizer_args,
                }
                train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
            script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
            jobs.append(
                PlannedJob(
                    benchmark=benchmark_name,
                    policy=policy_name,
                    run_rel=run_rel,
                    num_gpus=num_gpus,
                    microbatch_per_gpu=policy.microbatch_per_gpu,
                    gradient_accumulation_steps=gradient_accumulation_steps,
                    effective_batch_size=benchmark.effective_batch_size,
                    docker_image=benchmark.docker_image,
                    train_args=train_args,
                    eval_args=eval_args,
                    tokenizer_args=tokenizer_args,
                    script_path=script_path,
                )
            )
    return jobs
|
||||
|
||||
|
||||
def render_sbatch_script(
    *,
    job: PlannedJob,
    output_dir: Path,
    results_repo_id: str,
    git_commit: str,
) -> str:
    """Render the full sbatch script for one planned job.

    The script runs (optionally) tokenizer training, then training, then
    evaluation, then the result-publishing step, each inside the benchmark's
    docker image with the host output dir bind-mounted at /benchmark-output.
    ``{{...}}`` in the template escapes to literal ``{...}`` for bash.
    """
    host_output_dir = output_dir.resolve()
    # Container-side vs host-side views of the same run directory.
    run_root = f"/benchmark-output/{job.run_rel}"
    host_run_root = host_output_dir / job.run_rel
    # Scale SLURM CPU/memory requests with the GPU count.
    cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
    mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
    # Emitted literally so the script expands $GPU_IDS at runtime.
    gpu_ids_expr = "${GPU_IDS}"
    train_cli = format_cli_args(job.train_args)
    eval_cli = format_cli_args(job.eval_args)
    tokenizer_command = ""
    if job.tokenizer_args:
        tokenizer_cli = format_cli_args(job.tokenizer_args)
        tokenizer_command = f"""
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    lerobot-train-tokenizer \\
{tokenizer_cli}
  '
"""
    return f"""#!/bin/bash
#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
#SBATCH --gres=gpu:{job.num_gpus}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem_gb}G
#SBATCH --output={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.out
#SBATCH --error={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.err

set -euo pipefail

HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
RUN_ROOT="{run_root}"

mkdir -p "{host_output_dir}/logs"
mkdir -p "{host_run_root.parent}"

{tokenizer_command}

TRAIN_START="$(date +%s)"
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
{train_cli}
  '
TRAIN_END="$(date +%s)"

EVAL_START="$(date +%s)"
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    lerobot-eval \\
{eval_cli}
  '
EVAL_END="$(date +%s)"
TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"

docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -e RUN_ROOT="${{RUN_ROOT}}" \\
  -e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
  -e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    uv run python benchmarks/publish_benchmark_result.py \\
      --benchmark={job.benchmark} \\
      --policy={job.policy} \\
      --run_root="${{RUN_ROOT}}" \\
      --results_repo={results_repo_id} \\
      --git_commit={git_commit} \\
      --num_gpus={job.num_gpus} \\
      --microbatch_per_gpu={job.microbatch_per_gpu} \\
      --gradient_accumulation_steps={job.gradient_accumulation_steps} \\
      --effective_batch_size={job.effective_batch_size} \\
      --train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
      --eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
      --slurm_job_id="${{SLURM_JOB_ID:-}}" \\
      --docker_image={job.docker_image}
  '
"""
|
||||
|
||||
|
||||
def write_manifest(
|
||||
*,
|
||||
output_dir: Path,
|
||||
jobs: list[PlannedJob],
|
||||
git_commit: str,
|
||||
hub_org: str,
|
||||
results_repo: str,
|
||||
) -> Path:
|
||||
manifest = {
|
||||
"generated_at": datetime.now(UTC).isoformat(),
|
||||
"git_commit": git_commit,
|
||||
"hub_org": hub_org,
|
||||
"results_repo": results_repo,
|
||||
"jobs": [asdict(job) for job in jobs],
|
||||
}
|
||||
manifest_path = output_dir / "manifest.json"
|
||||
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
|
||||
return manifest_path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse generator CLI options.

    ``--policies`` / ``--benchmarks`` default to None, which selects every
    registered entry (see get_requested_names).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--policies", nargs="*", default=None)
    parser.add_argument("--benchmarks", nargs="*", default=None)
    parser.add_argument("--output_dir", required=True, type=Path)
    parser.add_argument("--hub_org", required=True)
    parser.add_argument("--results_repo", required=True)
    # When set, each generated script is submitted via sbatch immediately.
    parser.add_argument("--submit", action="store_true")
    return parser.parse_args()
|
||||
|
||||
|
||||
def get_git_commit() -> str:
    """Return HEAD's full SHA for the repo in the current working directory.

    Raises ``subprocess.CalledProcessError`` when not inside a git repo.
    """
    return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
|
||||
|
||||
|
||||
def main() -> int:
    """Generate (and optionally submit) one sbatch script per planned job."""
    args = parse_args()
    # Pre-create the directory layout the scripts and SLURM logs rely on.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    (args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
    (args.output_dir / "logs").mkdir(parents=True, exist_ok=True)

    selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
    selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
    git_commit = get_git_commit()
    results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)

    jobs = plan_jobs(
        output_dir=args.output_dir,
        hub_org=args.hub_org,
        results_repo=results_repo_id,
        policies=selected_policies,
        benchmarks=selected_benchmarks,
    )

    for job in jobs:
        script = render_sbatch_script(
            job=job,
            output_dir=args.output_dir,
            results_repo_id=results_repo_id,
            git_commit=git_commit,
        )
        script_path = Path(job.script_path)
        script_path.write_text(script)
        # Mark executable so the script can also be run directly.
        script_path.chmod(0o755)
        if args.submit:
            subprocess.run(["sbatch", str(script_path)], check=True)

    manifest_path = write_manifest(
        output_dir=args.output_dir,
        jobs=jobs,
        git_commit=git_commit,
        hub_org=args.hub_org,
        results_repo=results_repo_id,
    )
    print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
    print(f"Manifest: {manifest_path}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,48 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest

# System packages needed by LIBERO-plus rendering and asset extraction.
USER root
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot

# Simulation / benchmark Python dependencies.
RUN uv pip install --no-cache \
    "robosuite==1.4.1" bddl easydict mujoco matplotlib wand scikit-image gym

# Install LIBERO-plus from source (no deps: pins above take precedence) and
# drop the conflicting hf-libero package if present (best-effort).
ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
RUN git clone --depth=1 https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
    && uv pip uninstall hf-libero 2>/dev/null || true
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"

# Download the LIBERO-plus asset archive from the Hub and move its deeply
# nested assets directory into the package tree.
RUN python -c "\
from huggingface_hub import hf_hub_download; \
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
    && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
    && mv /tmp/libero-plus-dl/extract/inspire/hdd/project/embodied-multimodality/public/syfei/libero_new/release/dataset/LIBERO-plus-0/assets \
    ${LIBERO_PLUS_ROOT}/assets \
    && rm -rf /tmp/libero-plus-dl

# Point the LIBERO runtime config at the installed asset/bddl/init-state dirs.
RUN mkdir -p /home/user_lerobot/.libero \
    && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
    > /home/user_lerobot/.libero/config.yaml

# Copy the lerobot source tree last so code changes do not bust earlier layers.
COPY --chown=user_lerobot:user_lerobot . .

CMD ["/bin/bash"]
|
||||
@@ -0,0 +1,39 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest

# Expose all NVIDIA driver capabilities and point Vulkan at the NVIDIA ICD
# (RoboMME rendering uses Vulkan).
ENV NVIDIA_DRIVER_CAPABILITIES=all \
    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json

# Vulkan runtime plus a hand-written NVIDIA ICD manifest (the base image does
# not ship one).
USER root
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    libvulkan1 libvulkan-dev mesa-vulkan-drivers \
    && mkdir -p /usr/share/vulkan/icd.d \
    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
    > /usr/share/vulkan/icd.d/nvidia_icd.json \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot

# Install lerobot extras and RoboMME; the override file pins gymnasium/numpy
# versions RoboMME requires, taking precedence over transitive constraints.
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
    && uv pip install --no-cache --override /tmp/robomme_override.txt \
    -e ".[smolvla,av-dep]" \
    "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
    && python -c "import robomme; print('robomme import OK')"

# Copy the full source tree last to keep the dependency layers cached.
COPY --chown=user_lerobot:user_lerobot . .

CMD ["/bin/bash"]
|
||||
@@ -31,10 +31,22 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# LIBERO-plus derives task.language by space-joining the perturbation-variant
|
||||
# filename, so strip the perturbation metadata blob to recover the base prompt.
|
||||
_LIBERO_PERTURBATION_TAIL_RE = re.compile(
|
||||
r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
|
||||
)
|
||||
|
||||
|
||||
def _strip_libero_perturbation_tail(instruction: str) -> str:
|
||||
return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
|
||||
|
||||
|
||||
def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
from libero.libero import benchmark # type: ignore[import-untyped]
|
||||
|
||||
@@ -47,7 +59,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
)
|
||||
return {}
|
||||
suite = suite_dict[task_suite]()
|
||||
return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
|
||||
return {
|
||||
f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
|
||||
for i in range(suite.n_tasks)
|
||||
}
|
||||
|
||||
|
||||
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
@@ -57,6 +72,14 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
return {f"{task_name}_0": label}
|
||||
|
||||
|
||||
def _robomme_descriptions(task_names: str) -> dict[str, str]:
|
||||
return {
|
||||
f"{task_name}_0": task_name.replace("_", " ").strip()
|
||||
for task_name in (task.strip() for task in task_names.split(","))
|
||||
if task_name
|
||||
}
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
|
||||
@@ -66,10 +89,12 @@ def main() -> int:
|
||||
|
||||
descriptions: dict[str, str] = {}
|
||||
try:
|
||||
if args.env == "libero":
|
||||
if args.env in {"libero", "libero_plus"}:
|
||||
descriptions = _libero_descriptions(args.task)
|
||||
elif args.env == "metaworld":
|
||||
descriptions = _metaworld_descriptions(args.task)
|
||||
elif args.env == "robomme":
|
||||
descriptions = _robomme_descriptions(args.task)
|
||||
else:
|
||||
print(
|
||||
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
---
|
||||
title: LeRobot Benchmark Leaderboard
|
||||
emoji: 🤖
|
||||
colorFrom: yellow
|
||||
colorTo: orange
|
||||
sdk: gradio
|
||||
sdk_version: 5.29.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: apache-2.0
|
||||
short_description: Benchmark history for LeRobot policy x benchmark runs
|
||||
---
|
||||
|
||||
# LeRobot Benchmark Leaderboard
|
||||
|
||||
This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
|
||||
|
||||
- Latest result per policy and benchmark
|
||||
- Historical trends over time
|
||||
- Direct links to uploaded eval and config artifacts
|
||||
|
||||
## Configuration
|
||||
|
||||
Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI
|
||||
at a different public dataset. The default is:
|
||||
|
||||
- `lerobot/benchmark-history`
|
||||
@@ -0,0 +1,226 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import gradio as gr
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
|
||||
RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
|
||||
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
|
||||
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CACHE_TTL_S = 300
|
||||
|
||||
_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
|
||||
|
||||
|
||||
def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
|
||||
overall = row.get("eval", {}).get("overall", {})
|
||||
resources = row.get("resources", {})
|
||||
timings = row.get("timings", {})
|
||||
artifact_urls = row.get("artifact_urls", {})
|
||||
return {
|
||||
"created_at": row.get("created_at"),
|
||||
"benchmark": row.get("benchmark"),
|
||||
"policy": row.get("policy"),
|
||||
"success_rate": overall.get("pc_success"),
|
||||
"n_episodes": overall.get("n_episodes"),
|
||||
"avg_sum_reward": overall.get("avg_sum_reward"),
|
||||
"train_wall_time_s": timings.get("train_wall_time_s"),
|
||||
"eval_wall_time_s": timings.get("eval_wall_time_s"),
|
||||
"total_wall_time_s": timings.get("total_wall_time_s"),
|
||||
"num_gpus": resources.get("num_gpus"),
|
||||
"microbatch_per_gpu": resources.get("microbatch_per_gpu"),
|
||||
"gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
|
||||
"effective_batch_size": resources.get("effective_batch_size"),
|
||||
"git_commit": row.get("git_commit"),
|
||||
"row_url": artifact_urls.get("row"),
|
||||
"eval_info_url": artifact_urls.get("eval_info"),
|
||||
"train_config_url": artifact_urls.get("train_config"),
|
||||
}
|
||||
|
||||
|
||||
def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
|
||||
cache_key = f"rows::{repo_id}"
|
||||
cached = _CACHE.get(cache_key)
|
||||
if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
|
||||
return cached[1]
|
||||
|
||||
api = HfApi()
|
||||
files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
|
||||
records: list[dict[str, Any]] = []
|
||||
for path_in_repo in sorted(files, reverse=True):
|
||||
local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
|
||||
with open(local_path) as f:
|
||||
row = json.load(f)
|
||||
records.append(_row_to_record(row))
|
||||
|
||||
df = pd.DataFrame.from_records(records)
|
||||
if not df.empty:
|
||||
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
|
||||
df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
|
||||
_CACHE[cache_key] = (time.monotonic(), df)
|
||||
return df
|
||||
|
||||
|
||||
def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
|
||||
if df.empty:
|
||||
return df
|
||||
latest = (
|
||||
df.sort_values("created_at", ascending=False)
|
||||
.groupby(["benchmark", "policy"], as_index=False)
|
||||
.first()
|
||||
.sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
|
||||
)
|
||||
return latest[
|
||||
[
|
||||
"benchmark",
|
||||
"policy",
|
||||
"success_rate",
|
||||
"n_episodes",
|
||||
"train_wall_time_s",
|
||||
"eval_wall_time_s",
|
||||
"num_gpus",
|
||||
"effective_batch_size",
|
||||
"git_commit",
|
||||
"row_url",
|
||||
"eval_info_url",
|
||||
"train_config_url",
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
|
||||
filtered = df[df["benchmark"] == benchmark]
|
||||
if policy and policy != "All":
|
||||
filtered = filtered[filtered["policy"] == policy]
|
||||
if filtered.empty:
|
||||
return px.line(title="No benchmark rows found")
|
||||
fig = px.line(
|
||||
filtered.sort_values("created_at"),
|
||||
x="created_at",
|
||||
y="success_rate",
|
||||
color="policy",
|
||||
markers=True,
|
||||
hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
|
||||
title=f"{benchmark} success rate history",
|
||||
)
|
||||
fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
|
||||
return fig
|
||||
|
||||
|
||||
def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
|
||||
filtered = df[df["benchmark"] == benchmark]
|
||||
if policy and policy != "All":
|
||||
filtered = filtered[filtered["policy"] == policy]
|
||||
if filtered.empty:
|
||||
return "No matching runs yet."
|
||||
latest = filtered.sort_values("created_at", ascending=False).iloc[0]
|
||||
row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
|
||||
eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
|
||||
train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
|
||||
lines = [
|
||||
f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
|
||||
f"Success rate: `{latest['success_rate']}`",
|
||||
f"GPUs: `{latest['num_gpus']}`",
|
||||
f"Effective batch size: `{latest['effective_batch_size']}`",
|
||||
f"Commit: `{latest['git_commit']}`",
|
||||
]
|
||||
if row_link:
|
||||
lines.append(f"Row JSON: [open]({row_link})")
|
||||
if eval_link:
|
||||
lines.append(f"Eval Info: [open]({eval_link})")
|
||||
if train_link:
|
||||
lines.append(f"Train Config: [open]({train_link})")
|
||||
return "\n\n".join(lines)
|
||||
|
||||
|
||||
def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
|
||||
df = load_rows()
|
||||
latest_table = make_latest_table(df)
|
||||
benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
|
||||
if benchmark not in benchmark_names and benchmark_names:
|
||||
benchmark = benchmark_names[0]
|
||||
policy_choices = ["All"]
|
||||
if benchmark and not df.empty:
|
||||
policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
|
||||
if policy not in policy_choices:
|
||||
policy = "All"
|
||||
history = make_history_figure(df, benchmark, policy)
|
||||
summary = make_run_markdown(df, benchmark, policy)
|
||||
return latest_table, gr.update(choices=policy_choices, value=policy), history, summary
|
||||
|
||||
|
||||
with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
|
||||
gr.Markdown(
|
||||
f"""
|
||||
# LeRobot Benchmark Leaderboard
|
||||
|
||||
Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
|
||||
"""
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
|
||||
policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
|
||||
refresh_button = gr.Button("Refresh")
|
||||
|
||||
latest_table = gr.Dataframe(label="Latest Results", interactive=False)
|
||||
history_plot = gr.Plot(label="History")
|
||||
latest_summary = gr.Markdown()
|
||||
|
||||
def _initial_state():
|
||||
df = load_rows()
|
||||
benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
|
||||
benchmark = benchmarks[0] if benchmarks else ""
|
||||
latest, policy_choices, history, summary = refresh_view(benchmark, "All")
|
||||
return (
|
||||
gr.update(choices=benchmarks, value=benchmark),
|
||||
policy_choices,
|
||||
latest,
|
||||
history,
|
||||
summary,
|
||||
)
|
||||
|
||||
demo.load(
|
||||
_initial_state,
|
||||
outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
|
||||
)
|
||||
refresh_button.click(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
benchmark_dropdown.change(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
policy_dropdown.change(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
@@ -0,0 +1,4 @@
|
||||
gradio>=5.0.0,<6.0.0
|
||||
plotly>=5.18.0
|
||||
pandas>=2.0.0
|
||||
huggingface-hub>=1.0.0,<2.0.0
|
||||
@@ -67,11 +67,17 @@ class EvalConfig:
|
||||
# `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
|
||||
# Set to 0 for auto-tuning based on available CPU cores and n_episodes.
|
||||
batch_size: int = 0
|
||||
# Number of rollout videos to save per evaluated task. Set to 0 to disable videos.
|
||||
max_episodes_rendered: int = 10
|
||||
# `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
|
||||
# Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1.
|
||||
use_async_envs: bool = True
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.max_episodes_rendered < 0:
|
||||
raise ValueError(
|
||||
f"`max_episodes_rendered` must be non-negative, got {self.max_episodes_rendered}."
|
||||
)
|
||||
if self.batch_size == 0:
|
||||
self.batch_size = self._auto_batch_size()
|
||||
if self.batch_size > self.n_episodes:
|
||||
|
||||
@@ -56,6 +56,7 @@ class TrainPipelineConfig(HubMixin):
|
||||
# Number of workers for the dataloader.
|
||||
num_workers: int = 4
|
||||
batch_size: int = 8
|
||||
gradient_accumulation_steps: int = 1
|
||||
steps: int = 100_000
|
||||
eval_freq: int = 20_000
|
||||
log_freq: int = 200
|
||||
@@ -132,6 +133,11 @@ class TrainPipelineConfig(HubMixin):
|
||||
if isinstance(self.dataset.repo_id, list):
|
||||
raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")
|
||||
|
||||
if self.gradient_accumulation_steps <= 0:
|
||||
raise ValueError(
|
||||
f"`gradient_accumulation_steps` must be strictly positive, got {self.gradient_accumulation_steps}."
|
||||
)
|
||||
|
||||
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
|
||||
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
|
||||
elif self.use_policy_training_preset and not self.resume:
|
||||
|
||||
@@ -18,7 +18,15 @@
|
||||
# from lerobot.utils.import_utils import require_package
|
||||
# require_package("gymnasium", extra="<update_extra>", import_name="gymnasium")
|
||||
|
||||
from .configs import AlohaEnv, EnvConfig, HILSerlRobotEnvConfig, HubEnvConfig, PushtEnv
|
||||
from .configs import (
|
||||
AlohaEnv,
|
||||
EnvConfig,
|
||||
HILSerlRobotEnvConfig,
|
||||
HubEnvConfig,
|
||||
LiberoPlusEnv,
|
||||
PushtEnv,
|
||||
RoboMMEEnv,
|
||||
)
|
||||
from .factory import make_env, make_env_config, make_env_pre_post_processors
|
||||
from .utils import check_env_attributes_and_types, close_envs, env_to_policy_features, preprocess_observation
|
||||
|
||||
@@ -27,7 +35,9 @@ __all__ = [
|
||||
"EnvConfig",
|
||||
"HILSerlRobotEnvConfig",
|
||||
"HubEnvConfig",
|
||||
"LiberoPlusEnv",
|
||||
"PushtEnv",
|
||||
"RoboMMEEnv",
|
||||
"check_env_attributes_and_types",
|
||||
"close_envs",
|
||||
"env_to_policy_features",
|
||||
|
||||
@@ -574,3 +574,58 @@ class IsaaclabArenaEnv(HubEnvConfig):
|
||||
),
|
||||
PolicyProcessorPipeline(steps=[]),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("libero_plus")
|
||||
@dataclass
|
||||
class LiberoPlusEnv(LiberoEnv):
|
||||
"""Config for LIBERO-plus robustness benchmark evaluation."""
|
||||
|
||||
task: str = "libero_spatial"
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robomme")
|
||||
@dataclass
|
||||
class RoboMMEEnv(EnvConfig):
|
||||
"""RoboMME memory-augmented manipulation benchmark."""
|
||||
|
||||
task: str = "PickXtimes"
|
||||
fps: int = 10
|
||||
episode_length: int = 300
|
||||
action_space: str = "joint_angle"
|
||||
dataset_split: str = "test"
|
||||
task_ids: list[int] | None = None
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)),
|
||||
"image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
"wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)),
|
||||
}
|
||||
)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"image": f"{OBS_IMAGES}.image",
|
||||
"wrist_image": f"{OBS_IMAGES}.wrist_image",
|
||||
OBS_STATE: OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from .robomme import create_robomme_envs
|
||||
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_robomme_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
action_space_type=self.action_space,
|
||||
dataset=self.dataset_split,
|
||||
episode_length=self.episode_length,
|
||||
task_ids=self.task_ids,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from functools import partial
|
||||
@@ -69,14 +70,28 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
|
||||
return ids
|
||||
|
||||
|
||||
# LIBERO-plus perturbation variants encode the perturbation in the filename
|
||||
# but on disk only the base `.pruned_init` exists — strip the suffix to match
|
||||
# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
|
||||
# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
|
||||
_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
|
||||
|
||||
|
||||
def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
|
||||
init_states_path = (
|
||||
Path(get_libero_path("init_states"))
|
||||
/ task_suite.tasks[i].problem_folder
|
||||
/ task_suite.tasks[i].init_states_file
|
||||
)
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states
|
||||
task = task_suite.tasks[i]
|
||||
filename = Path(task.init_states_file)
|
||||
root = Path(get_libero_path("init_states"))
|
||||
|
||||
# `_add_` / `_level` variants store extra-object layouts under libero_newobj/
|
||||
# as a flat array that must be reshaped to (1, -1).
|
||||
if "_add_" in filename.name or "_level" in filename.name:
|
||||
init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states.reshape(1, -1)
|
||||
|
||||
stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
|
||||
init_states_path = root / task.problem_folder / stripped
|
||||
return torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
|
||||
|
||||
def get_libero_dummy_action():
|
||||
|
||||
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""RoboMME environment wrapper for LeRobot evaluation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
ROBOMME_TASKS = [
|
||||
"BinFill",
|
||||
"PickXtimes",
|
||||
"SwingXtimes",
|
||||
"StopCube",
|
||||
"VideoUnmask",
|
||||
"VideoUnmaskSwap",
|
||||
"ButtonUnmask",
|
||||
"ButtonUnmaskSwap",
|
||||
"PickHighlight",
|
||||
"VideoRepick",
|
||||
"VideoPlaceButton",
|
||||
"VideoPlaceOrder",
|
||||
"MoveCube",
|
||||
"InsertPeg",
|
||||
"PatternLock",
|
||||
"RouteStick",
|
||||
]
|
||||
|
||||
|
||||
class RoboMMEGymEnv(gym.Env):
|
||||
"""Thin Gymnasium wrapper around a single RoboMME episode env."""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str = "PickXtimes",
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_idx: int = 0,
|
||||
max_steps: int = 300,
|
||||
):
|
||||
super().__init__()
|
||||
from robomme.env_record_wrapper import BenchmarkEnvBuilder
|
||||
|
||||
self._builder = BenchmarkEnvBuilder(
|
||||
env_id=task,
|
||||
dataset=dataset,
|
||||
action_space=action_space_type,
|
||||
gui_render=False,
|
||||
max_steps=max_steps,
|
||||
)
|
||||
self._max_episode_steps = max_steps
|
||||
self._episode_idx = episode_idx
|
||||
self._max_steps = max_steps
|
||||
self._env = None
|
||||
self._last_raw_obs: dict | None = None
|
||||
|
||||
action_dim = 8 if action_space_type == "joint_angle" else 7
|
||||
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
"state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
|
||||
}
|
||||
)
|
||||
|
||||
def reset(self, *, seed=None, options=None):
|
||||
super().reset(seed=seed)
|
||||
self._env = self._builder.make_env_for_episode(
|
||||
episode_idx=self._episode_idx,
|
||||
max_steps=self._max_steps,
|
||||
)
|
||||
obs, info = self._env.reset()
|
||||
self._last_raw_obs = obs
|
||||
return self._convert_obs(obs), self._convert_info(info)
|
||||
|
||||
def step(self, action):
|
||||
obs, reward, terminated, truncated, info = self._env.step(action)
|
||||
self._last_raw_obs = obs
|
||||
|
||||
terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
|
||||
truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
|
||||
|
||||
status = info.get("status", "ongoing")
|
||||
conv_info = self._convert_info(info)
|
||||
conv_info["is_success"] = status == "success"
|
||||
|
||||
return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
|
||||
|
||||
def render(self) -> np.ndarray | None:
|
||||
if self._last_raw_obs is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
front = self._last_raw_obs.get("front_rgb_list")
|
||||
if front is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
frame = front[-1] if isinstance(front, list) else front
|
||||
return np.asarray(frame, dtype=np.uint8)
|
||||
|
||||
def _convert_obs(self, obs: dict) -> dict:
|
||||
front_rgb = (
|
||||
obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"]
|
||||
)
|
||||
wrist_rgb = (
|
||||
obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"]
|
||||
)
|
||||
joint_state = (
|
||||
obs["joint_state_list"][-1]
|
||||
if isinstance(obs["joint_state_list"], list)
|
||||
else obs["joint_state_list"]
|
||||
)
|
||||
gripper_state = (
|
||||
obs["gripper_state_list"][-1]
|
||||
if isinstance(obs["gripper_state_list"], list)
|
||||
else obs["gripper_state_list"]
|
||||
)
|
||||
|
||||
joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
|
||||
gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
|
||||
state = np.concatenate([joint, gripper])
|
||||
|
||||
return {
|
||||
"image": np.asarray(front_rgb, dtype=np.uint8),
|
||||
"wrist_image": np.asarray(wrist_rgb, dtype=np.uint8),
|
||||
"state": state,
|
||||
}
|
||||
|
||||
def _convert_info(self, info: dict) -> dict:
|
||||
return {
|
||||
"status": info.get("status", "ongoing"),
|
||||
"task_goal": info.get("task_goal", ""),
|
||||
}
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task: str,
|
||||
n_envs: int,
|
||||
action_space_type: str,
|
||||
dataset: str,
|
||||
episode_length: int,
|
||||
task_id: int,
|
||||
) -> list[Callable[[], RoboMMEGymEnv]]:
|
||||
def _make_one(episode_index: int) -> RoboMMEGymEnv:
|
||||
return RoboMMEGymEnv(
|
||||
task=task,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_idx=episode_index,
|
||||
max_steps=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, task_id + i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robomme_envs(
|
||||
task: str,
|
||||
n_envs: int = 1,
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_length: int = 300,
|
||||
task_ids: list[int] | None = None,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
||||
"""Create vectorized RoboMME environments for evaluation."""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
if task_ids is None:
|
||||
task_ids = [0]
|
||||
|
||||
task_names = [t.strip() for t in task.split(",") if t.strip()]
|
||||
out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
|
||||
for task_name in task_names:
|
||||
envs_by_task: dict[int, gym.vector.VectorEnv] = {}
|
||||
for task_id in task_ids:
|
||||
fns = _make_env_fns(
|
||||
task=task_name,
|
||||
n_envs=n_envs,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_length=episode_length,
|
||||
task_id=task_id,
|
||||
)
|
||||
envs_by_task[task_id] = env_cls(fns)
|
||||
out[task_name] = envs_by_task
|
||||
return out
|
||||
@@ -572,7 +572,7 @@ def eval_main(cfg: EvalPipelineConfig):
|
||||
preprocessor=preprocessor,
|
||||
postprocessor=postprocessor,
|
||||
n_episodes=cfg.eval.n_episodes,
|
||||
max_episodes_rendered=10,
|
||||
max_episodes_rendered=cfg.eval.max_episodes_rendered,
|
||||
videos_dir=Path(cfg.output_dir) / "videos",
|
||||
start_seed=cfg.seed,
|
||||
max_parallel_tasks=cfg.env.max_parallel_tasks,
|
||||
|
||||
@@ -71,6 +71,9 @@ def update_policy(
|
||||
lr_scheduler=None,
|
||||
lock=None,
|
||||
rabc_weights_provider=None,
|
||||
*,
|
||||
do_optimizer_step: bool = True,
|
||||
loss_divisor: int = 1,
|
||||
) -> tuple[MetricsTracker, dict]:
|
||||
"""
|
||||
Performs a single training step to update the policy's weights.
|
||||
@@ -122,34 +125,38 @@ def update_policy(
|
||||
loss, output_dict = policy.forward(batch)
|
||||
|
||||
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
||||
logged_loss = loss.detach()
|
||||
if loss_divisor > 1:
|
||||
loss = loss / loss_divisor
|
||||
|
||||
# Use accelerator's backward method
|
||||
accelerator.backward(loss)
|
||||
|
||||
# Clip gradients if specified
|
||||
if grad_clip_norm > 0:
|
||||
grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
|
||||
else:
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
policy.parameters(), float("inf"), error_if_nonfinite=False
|
||||
)
|
||||
grad_norm_value = 0.0
|
||||
if do_optimizer_step:
|
||||
if grad_clip_norm > 0:
|
||||
grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
|
||||
else:
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
policy.parameters(), float("inf"), error_if_nonfinite=False
|
||||
)
|
||||
grad_norm_value = grad_norm.item()
|
||||
|
||||
# Optimizer step
|
||||
with lock if lock is not None else nullcontext():
|
||||
optimizer.step()
|
||||
with lock if lock is not None else nullcontext():
|
||||
optimizer.step()
|
||||
|
||||
optimizer.zero_grad()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Step through pytorch scheduler at every batch instead of epoch
|
||||
if lr_scheduler is not None:
|
||||
lr_scheduler.step()
|
||||
# Step through pytorch scheduler at every optimizer step instead of epoch
|
||||
if lr_scheduler is not None:
|
||||
lr_scheduler.step()
|
||||
|
||||
# Update internal buffers if policy has update method
|
||||
if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
|
||||
accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
|
||||
# Update internal buffers if policy has update method
|
||||
if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
|
||||
accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
|
||||
|
||||
train_metrics.loss = loss.item()
|
||||
train_metrics.grad_norm = grad_norm.item()
|
||||
train_metrics.loss = logged_loss.item()
|
||||
train_metrics.grad_norm = grad_norm_value
|
||||
train_metrics.lr = optimizer.param_groups[0]["lr"]
|
||||
train_metrics.update_s = time.perf_counter() - start_time
|
||||
return train_metrics, output_dict
|
||||
@@ -359,8 +366,16 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
|
||||
logging.info(f"{dataset.num_episodes=}")
|
||||
num_processes = accelerator.num_processes
|
||||
effective_bs = cfg.batch_size * num_processes
|
||||
logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
|
||||
micro_batch = cfg.batch_size
|
||||
logical_batch = cfg.batch_size * cfg.gradient_accumulation_steps
|
||||
effective_bs = logical_batch * num_processes
|
||||
logging.info(
|
||||
"Effective batch size: %s x %s x %s = %s",
|
||||
micro_batch,
|
||||
cfg.gradient_accumulation_steps,
|
||||
num_processes,
|
||||
effective_bs,
|
||||
)
|
||||
logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
|
||||
logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
|
||||
|
||||
@@ -407,9 +422,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
}
|
||||
|
||||
# Keep global batch size for logging; MetricsTracker handles world size internally.
|
||||
effective_batch_size = cfg.batch_size * accelerator.num_processes
|
||||
logical_batch_size = cfg.batch_size * cfg.gradient_accumulation_steps
|
||||
effective_batch_size = logical_batch_size * accelerator.num_processes
|
||||
train_tracker = MetricsTracker(
|
||||
cfg.batch_size,
|
||||
logical_batch_size,
|
||||
dataset.num_frames,
|
||||
dataset.num_episodes,
|
||||
train_metrics,
|
||||
@@ -431,21 +447,62 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
)
|
||||
|
||||
for _ in range(step, cfg.steps):
|
||||
start_time = time.perf_counter()
|
||||
batch = next(dl_iter)
|
||||
batch = preprocessor(batch)
|
||||
train_tracker.dataloading_s = time.perf_counter() - start_time
|
||||
step_dataloading_s = 0.0
|
||||
step_update_s = 0.0
|
||||
step_losses = []
|
||||
step_grad_norm = 0.0
|
||||
step_lr = optimizer.param_groups[0]["lr"]
|
||||
output_dict = {}
|
||||
optimizer.zero_grad()
|
||||
for accumulation_idx in range(cfg.gradient_accumulation_steps):
|
||||
start_time = time.perf_counter()
|
||||
batch = next(dl_iter)
|
||||
batch = preprocessor(batch)
|
||||
step_dataloading_s += time.perf_counter() - start_time
|
||||
|
||||
train_tracker, output_dict = update_policy(
|
||||
train_tracker,
|
||||
policy,
|
||||
batch,
|
||||
optimizer,
|
||||
cfg.optimizer.grad_clip_norm,
|
||||
accelerator=accelerator,
|
||||
lr_scheduler=lr_scheduler,
|
||||
rabc_weights_provider=rabc_weights,
|
||||
)
|
||||
is_last_microbatch = accumulation_idx == cfg.gradient_accumulation_steps - 1
|
||||
micro_metrics = MetricsTracker(
|
||||
cfg.batch_size,
|
||||
dataset.num_frames,
|
||||
dataset.num_episodes,
|
||||
{
|
||||
"loss": AverageMeter("loss", ":.3f"),
|
||||
"grad_norm": AverageMeter("grdn", ":.3f"),
|
||||
"lr": AverageMeter("lr", ":0.1e"),
|
||||
"update_s": AverageMeter("updt_s", ":.3f"),
|
||||
},
|
||||
accelerator=accelerator,
|
||||
)
|
||||
sync_context = (
|
||||
nullcontext()
|
||||
if is_last_microbatch or accelerator.num_processes == 1
|
||||
else accelerator.no_sync(policy)
|
||||
)
|
||||
with sync_context:
|
||||
micro_metrics, micro_output_dict = update_policy(
|
||||
micro_metrics,
|
||||
policy,
|
||||
batch,
|
||||
optimizer,
|
||||
cfg.optimizer.grad_clip_norm,
|
||||
accelerator=accelerator,
|
||||
lr_scheduler=lr_scheduler if is_last_microbatch else None,
|
||||
rabc_weights_provider=rabc_weights,
|
||||
do_optimizer_step=is_last_microbatch,
|
||||
loss_divisor=cfg.gradient_accumulation_steps,
|
||||
)
|
||||
step_update_s += micro_metrics.update_s.val
|
||||
step_losses.append(micro_metrics.loss.val)
|
||||
if is_last_microbatch:
|
||||
step_grad_norm = micro_metrics.grad_norm.val
|
||||
step_lr = micro_metrics.lr.val
|
||||
output_dict = micro_output_dict
|
||||
|
||||
train_tracker.loss = sum(step_losses) / len(step_losses)
|
||||
train_tracker.grad_norm = step_grad_norm
|
||||
train_tracker.lr = step_lr
|
||||
train_tracker.update_s = step_update_s
|
||||
train_tracker.dataloading_s = step_dataloading_s
|
||||
|
||||
# Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
|
||||
# increment `step` here.
|
||||
@@ -510,7 +567,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
postprocessor=postprocessor,
|
||||
n_episodes=cfg.eval.n_episodes,
|
||||
videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
|
||||
max_episodes_rendered=4,
|
||||
max_episodes_rendered=cfg.eval.max_episodes_rendered,
|
||||
start_seed=cfg.seed,
|
||||
max_parallel_tasks=cfg.env.max_parallel_tasks,
|
||||
)
|
||||
@@ -541,7 +598,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
if wandb_logger:
|
||||
wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
|
||||
wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
|
||||
wandb_logger.log_video(eval_info["overall"]["video_paths"][0], step, mode="eval")
|
||||
video_paths = eval_info["overall"].get("video_paths", [])
|
||||
if video_paths:
|
||||
wandb_logger.log_video(video_paths[0], step, mode="eval")
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
|
||||
def utc_timestamp_slug(now: datetime | None = None) -> str:
|
||||
current = now or datetime.now(UTC)
|
||||
return current.strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
|
||||
def make_hub_file_url(repo_id: str, path_in_repo: str, repo_type: str = "dataset") -> str:
|
||||
prefix = "datasets/" if repo_type == "dataset" else ""
|
||||
return f"https://huggingface.co/{prefix}{repo_id}/resolve/main/{path_in_repo}"
|
||||
|
||||
|
||||
def write_json(path: Path, payload: dict[str, Any]) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(payload, indent=2, sort_keys=True))
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class UploadTarget:
|
||||
local_path: Path
|
||||
path_in_repo: str
|
||||
|
||||
|
||||
def upload_targets(
|
||||
repo_id: str,
|
||||
targets: list[UploadTarget],
|
||||
*,
|
||||
repo_type: str = "dataset",
|
||||
token: str | None = None,
|
||||
private: bool | None = None,
|
||||
commit_message: str | None = None,
|
||||
) -> dict[str, str]:
|
||||
api = HfApi(token=token)
|
||||
api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
|
||||
uploaded: dict[str, str] = {}
|
||||
for target in targets:
|
||||
api.upload_file(
|
||||
path_or_fileobj=str(target.local_path),
|
||||
path_in_repo=target.path_in_repo,
|
||||
repo_id=repo_id,
|
||||
repo_type=repo_type,
|
||||
commit_message=commit_message or f"Upload {target.path_in_repo}",
|
||||
)
|
||||
uploaded[target.path_in_repo] = make_hub_file_url(repo_id, target.path_in_repo, repo_type=repo_type)
|
||||
return uploaded
|
||||
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
|
||||
from benchmarks.run_benchmark_matrix import (
|
||||
PlannedJob,
|
||||
compute_gradient_accumulation_steps,
|
||||
plan_jobs,
|
||||
render_sbatch_script,
|
||||
write_manifest,
|
||||
)
|
||||
|
||||
|
||||
def _one_job(job_list: list[PlannedJob]) -> PlannedJob:
|
||||
assert len(job_list) == 1
|
||||
return job_list[0]
|
||||
|
||||
|
||||
def test_compute_gradient_accumulation_steps_for_fixed_effective_batch():
|
||||
assert compute_gradient_accumulation_steps(
|
||||
effective_batch_size=256,
|
||||
num_gpus=8,
|
||||
microbatch_per_gpu=32,
|
||||
) == 1
|
||||
assert compute_gradient_accumulation_steps(
|
||||
effective_batch_size=256,
|
||||
num_gpus=4,
|
||||
microbatch_per_gpu=32,
|
||||
) == 2
|
||||
assert compute_gradient_accumulation_steps(
|
||||
effective_batch_size=256,
|
||||
num_gpus=1,
|
||||
microbatch_per_gpu=32,
|
||||
) == 8
|
||||
|
||||
|
||||
def test_plan_jobs_filters_libero_plus_only(tmp_path):
|
||||
jobs = plan_jobs(
|
||||
output_dir=tmp_path,
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
policies=["pi0", "act"],
|
||||
benchmarks=["libero_plus"],
|
||||
)
|
||||
|
||||
assert [job.benchmark for job in jobs] == ["libero_plus", "libero_plus"]
|
||||
assert [job.policy for job in jobs] == ["pi0", "act"]
|
||||
|
||||
|
||||
def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path):
|
||||
jobs = plan_jobs(
|
||||
output_dir=tmp_path,
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
policies=["pi0"],
|
||||
benchmarks=["libero_plus", "robomme"],
|
||||
)
|
||||
|
||||
assert [job.benchmark for job in jobs] == ["libero_plus", "robomme"]
|
||||
assert jobs[0].effective_batch_size == 256
|
||||
assert jobs[1].effective_batch_size == 256
|
||||
|
||||
|
||||
def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path):
|
||||
jobs = plan_jobs(
|
||||
output_dir=tmp_path,
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
policies=["pi0", "xvla", "act"],
|
||||
benchmarks=["robomme"],
|
||||
)
|
||||
by_policy = {job.policy: job for job in jobs}
|
||||
|
||||
assert by_policy["pi0"].num_gpus == 8
|
||||
assert by_policy["pi0"].gradient_accumulation_steps == 1
|
||||
assert by_policy["xvla"].num_gpus == 4
|
||||
assert by_policy["xvla"].gradient_accumulation_steps == 2
|
||||
assert by_policy["act"].num_gpus == 1
|
||||
assert by_policy["act"].gradient_accumulation_steps == 8
|
||||
|
||||
|
||||
def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path):
|
||||
job = _one_job(
|
||||
plan_jobs(
|
||||
output_dir=tmp_path,
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
policies=["pi0_fast"],
|
||||
benchmarks=["robomme"],
|
||||
)
|
||||
)
|
||||
|
||||
script = render_sbatch_script(
|
||||
job=job,
|
||||
output_dir=tmp_path,
|
||||
results_repo_id="lerobot/benchmark-history",
|
||||
git_commit="deadbeef",
|
||||
)
|
||||
|
||||
assert "docker/Dockerfile" not in script
|
||||
assert "lerobot-benchmark-robomme:latest" in script
|
||||
assert '--dataset.repo_id="lerobot/robomme"' in script
|
||||
assert '--env.type="robomme"' in script
|
||||
assert "--gradient_accumulation_steps=1" in script
|
||||
assert "lerobot-train-tokenizer" in script
|
||||
assert "benchmarks/publish_benchmark_result.py" in script
|
||||
|
||||
|
||||
def test_write_manifest_records_job_metadata(tmp_path):
|
||||
jobs = plan_jobs(
|
||||
output_dir=tmp_path,
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
policies=["pi0"],
|
||||
benchmarks=["libero_plus", "robomme"],
|
||||
)
|
||||
manifest_path = write_manifest(
|
||||
output_dir=tmp_path,
|
||||
jobs=jobs,
|
||||
git_commit="deadbeef",
|
||||
hub_org="lerobot",
|
||||
results_repo="lerobot/benchmark-history",
|
||||
)
|
||||
|
||||
manifest = json.loads(manifest_path.read_text())
|
||||
assert manifest["git_commit"] == "deadbeef"
|
||||
assert manifest["results_repo"] == "lerobot/benchmark-history"
|
||||
assert [job["benchmark"] for job in manifest["jobs"]] == ["libero_plus", "robomme"]
|
||||
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from types import ModuleType
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _install_robomme_stub():
|
||||
stub = ModuleType("robomme")
|
||||
wrapper_stub = ModuleType("robomme.env_record_wrapper")
|
||||
|
||||
class FakeBuilder:
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def make_env_for_episode(self, episode_idx: int, max_steps: int):
|
||||
env = MagicMock()
|
||||
obs = {
|
||||
"front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
|
||||
"wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
|
||||
"joint_state_list": [np.zeros(7, dtype=np.float32)],
|
||||
"gripper_state_list": [np.zeros(2, dtype=np.float32)],
|
||||
}
|
||||
env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
|
||||
env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
|
||||
return env
|
||||
|
||||
wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
|
||||
stub.env_record_wrapper = wrapper_stub
|
||||
sys.modules["robomme"] = stub
|
||||
sys.modules["robomme.env_record_wrapper"] = wrapper_stub
|
||||
|
||||
|
||||
def _uninstall_robomme_stub():
|
||||
sys.modules.pop("robomme", None)
|
||||
sys.modules.pop("robomme.env_record_wrapper", None)
|
||||
|
||||
|
||||
def test_robomme_env_config_defaults():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
|
||||
cfg = RoboMMEEnv()
|
||||
assert cfg.task == "PickXtimes"
|
||||
assert cfg.fps == 10
|
||||
assert cfg.episode_length == 300
|
||||
assert cfg.action_space == "joint_angle"
|
||||
assert cfg.dataset_split == "test"
|
||||
assert cfg.task_ids is None
|
||||
|
||||
|
||||
def test_robomme_features_map():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
|
||||
|
||||
cfg = RoboMMEEnv()
|
||||
assert cfg.features_map[ACTION] == ACTION
|
||||
assert cfg.features_map["image"] == f"{OBS_IMAGES}.image"
|
||||
assert cfg.features_map["wrist_image"] == f"{OBS_IMAGES}.wrist_image"
|
||||
assert cfg.features_map[OBS_STATE] == OBS_STATE
|
||||
|
||||
|
||||
def test_convert_obs_list_format():
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import RoboMMEGymEnv
|
||||
|
||||
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
|
||||
|
||||
front = np.full((256, 256, 3), 42, dtype=np.uint8)
|
||||
wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
|
||||
joints = np.arange(7, dtype=np.float32)
|
||||
gripper = np.array([0.5, 0.5], dtype=np.float32)
|
||||
|
||||
obs_raw = {
|
||||
"front_rgb_list": [np.zeros_like(front), front],
|
||||
"wrist_rgb_list": [np.zeros_like(wrist), wrist],
|
||||
"joint_state_list": [np.zeros(7, dtype=np.float32), joints],
|
||||
"gripper_state_list": [np.zeros(2, dtype=np.float32), gripper],
|
||||
}
|
||||
|
||||
result = env._convert_obs(obs_raw)
|
||||
np.testing.assert_array_equal(result["image"], front)
|
||||
np.testing.assert_array_equal(result["wrist_image"], wrist)
|
||||
assert result["state"].shape == (8,)
|
||||
np.testing.assert_array_almost_equal(result["state"][:7], joints)
|
||||
assert result["state"][7] == gripper[0]
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
|
||||
|
||||
def test_create_robomme_envs_multi_task():
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
env_cls = MagicMock(return_value=MagicMock())
|
||||
result = create_robomme_envs(
|
||||
task="PickXtimes,BinFill,StopCube",
|
||||
n_envs=1,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"}
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
Reference in New Issue
Block a user