#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
from __future__ import annotations

import argparse
import json
import subprocess
from dataclasses import asdict, dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from lerobot.utils.history_repo import utc_timestamp_slug

# GPU-count clamps, training defaults, and per-GPU SLURM resource sizing.
MAX_GPUS = 8
MIN_GPUS = 1
DEFAULT_STEPS = 20_000
DEFAULT_EFFECTIVE_BATCH_SIZE = 256
DEFAULT_MICROBATCH_PER_GPU = 32
DEFAULT_EVAL_BATCH_SIZE = 1
DEFAULT_CPUS_PER_GPU = 8
DEFAULT_MEMORY_PER_GPU_GB = 40


@dataclass(frozen=True)
class BenchmarkSpec:
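    """One benchmark: dataset, eval environment/tasks, Docker image, and overrides."""
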
name: str
dataset_repo_id: str
docker_image: str
eval_env_type: str
eval_task: str
eval_n_episodes: int
train_steps: int = DEFAULT_STEPS
effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
train_extra_args: dict[str, Any] = field(default_factory=dict)
eval_extra_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PolicySpec:
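    """One policy: type, pretrained path, GPU budget, and per-policy overrides."""
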
name: str
policy_type: str
num_gpus: int
policy_path: str | None = None
microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
extra_train_args: dict[str, Any] = field(default_factory=dict)
extra_eval_args: dict[str, Any] = field(default_factory=dict)
needs_tokenizer: bool = False
tokenizer_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PlannedJob:
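    """A fully resolved benchmark x policy job, as serialized into manifest.json."""
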
benchmark: str
policy: str
run_rel: str
num_gpus: int
microbatch_per_gpu: int
gradient_accumulation_steps: int
effective_batch_size: int
docker_image: str
train_args: dict[str, Any]
eval_args: dict[str, Any]
tokenizer_args: dict[str, Any] | None
script_path: str


BENCHMARKS: dict[str, BenchmarkSpec] = {
"libero_plus": BenchmarkSpec(
name="libero_plus",
dataset_repo_id="lerobot/libero_plus",
docker_image="lerobot-benchmark-libero-plus:latest",
eval_env_type="libero_plus",
eval_task="libero_spatial,libero_object,libero_goal,libero_10",
eval_n_episodes=10,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.image2": "observation.images.camera2",
},
},
eval_extra_args={
"env.camera_name_mapping": {
"agentview_image": "camera1",
"robot0_eye_in_hand_image": "camera2",
},
"env.max_parallel_tasks": 1,
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
"robomme": BenchmarkSpec(
name="robomme",
dataset_repo_id="lerobot/robomme",
docker_image="lerobot-benchmark-robomme:latest",
eval_env_type="robomme",
eval_task=(
"BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
"ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
"VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
),
eval_n_episodes=50,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
},
eval_extra_args={
"env.dataset_split": "test",
"env.max_parallel_tasks": 1,
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
}


POLICIES: dict[str, PolicySpec] = {
"pi0": PolicySpec(
name="pi0",
policy_type="pi0",
policy_path="lerobot/pi0_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"pi0_fast": PolicySpec(
name="pi0_fast",
policy_type="pi0_fast",
policy_path="lerobot/pi0fast-base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
needs_tokenizer=True,
tokenizer_args={
"action_horizon": 30,
"encoded_dims": "0:7",
"normalization_mode": "QUANTILES",
"vocab_size": 1024,
"scale": 10.0,
"push_to_hub": True,
},
),
"pi05": PolicySpec(
name="pi05",
policy_type="pi05",
policy_path="lerobot/pi05_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"groot": PolicySpec(
name="groot",
policy_type="groot",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.base_model_path": "nvidia/GR00T-N1.5-3B",
"policy.tune_diffusion_model": True,
"policy.tune_projector": True,
"policy.tune_llm": False,
"policy.tune_visual": False,
"policy.use_bf16": True,
},
),
"act": PolicySpec(
name="act",
policy_type="act",
num_gpus=1,
extra_train_args={
"policy.n_action_steps": 30,
},
),
"diffusion": PolicySpec(
name="diffusion",
policy_type="diffusion",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
"policy.n_obs_steps": 2,
},
),
"smolvla": PolicySpec(
name="smolvla",
policy_type="smolvla",
policy_path="lerobot/smolvla_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.load_vlm_weights": True,
"policy.freeze_vision_encoder": False,
"policy.train_expert_only": False,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"xvla": PolicySpec(
name="xvla",
policy_type="xvla",
policy_path="lerobot/xvla-widowx",
num_gpus=4,
extra_train_args={
"policy.n_action_steps": 32,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"multi_task_dit": PolicySpec(
name="multi_task_dit",
policy_type="multi_task_dit",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
},
),
}
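
# To benchmark another policy, register a PolicySpec above. A minimal sketch
# (hypothetical entry; values are illustrative, not tuned):
#
#     POLICIES["my_policy"] = PolicySpec(
#         name="my_policy",
#         policy_type="my_policy",
#         num_gpus=2,
#         extra_train_args={"policy.n_action_steps": 30},
#     )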


def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
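    """Qualify a bare repo name with the hub org; pass fully qualified ids through."""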
return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"


def get_requested_names(
requested: list[str] | None,
available: dict[str, Any],
*,
kind: str,
) -> list[str]:
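    """Validate requested names against the registry; default to every entry."""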
if not requested:
return list(available)
unknown = sorted(set(requested) - set(available))
if unknown:
raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
return requested


def compute_gradient_accumulation_steps(
*,
effective_batch_size: int,
num_gpus: int,
microbatch_per_gpu: int,
) -> int:
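    """Number of accumulation steps needed to reach the effective batch size.

    For example, effective_batch_size=256 with num_gpus=8 and microbatch_per_gpu=32
    gives 256 // (8 * 32) == 1, while num_gpus=1 gives 256 // (1 * 32) == 8.
    """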
per_step_batch = num_gpus * microbatch_per_gpu
if effective_batch_size % per_step_batch != 0:
raise ValueError(
f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
f"{microbatch_per_gpu=}."
)
return effective_batch_size // per_step_batch


def make_run_slug() -> str:
return utc_timestamp_slug()


def shell_value(value: Any) -> str:
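    """Render a value as a double-quoted, escaped shell literal.

    Bools become "true"/"false" and dicts/lists become JSON. Backslashes, double
    quotes, "$", and backticks are escaped. Rendered values are later embedded in
    single-quoted `bash -lc` blocks, so values containing single quotes are not
    supported.
    """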
if isinstance(value, bool):
value = "true" if value else "false"
elif isinstance(value, (dict, list)):
value = json.dumps(value, sort_keys=True)
else:
value = str(value)
escaped = (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("$", "\\$")
.replace("`", "\\`")
)
return f'"{escaped}"'


def format_cli_args(args: dict[str, Any]) -> str:
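    """Render args as `--key=value` lines joined by shell line continuations."""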
lines = []
for key, value in args.items():
lines.append(f" --{key}={shell_value(value)}")
return " \\\n".join(lines)


def build_train_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
train_dir: str,
gradient_accumulation_steps: int,
) -> dict[str, Any]:
args: dict[str, Any] = {
"dataset.repo_id": benchmark.dataset_repo_id,
"output_dir": train_dir,
"steps": benchmark.train_steps,
"batch_size": policy.microbatch_per_gpu,
"gradient_accumulation_steps": gradient_accumulation_steps,
"eval_freq": 0,
"save_freq": benchmark.train_steps,
"save_checkpoint": True,
"log_freq": 100,
"wandb.enable": False,
"policy.push_to_hub": False,
"policy.device": "cuda",
}
if policy.policy_path:
args["policy.path"] = policy.policy_path
else:
args["policy.type"] = policy.policy_type
args.update(benchmark.train_extra_args)
args.update(policy.extra_train_args)
return args


def build_eval_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
checkpoint_path: str,
eval_dir: str,
) -> dict[str, Any]:
args: dict[str, Any] = {
"policy.path": checkpoint_path,
"env.type": benchmark.eval_env_type,
"env.task": benchmark.eval_task,
"eval.n_episodes": benchmark.eval_n_episodes,
"output_dir": eval_dir,
}
args.update(benchmark.eval_extra_args)
args.update(policy.extra_eval_args)
return args


def plan_jobs(
*,
output_dir: Path,
hub_org: str,
results_repo: str,
policies: list[str],
benchmarks: list[str],
) -> list[PlannedJob]:
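    """Expand the benchmark x policy matrix into fully resolved PlannedJob entries."""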
    _ = results_repo  # unused here; the results repo is consumed by render_sbatch_script
scripts_dir = output_dir / "slurm"
jobs: list[PlannedJob] = []
for benchmark_name in benchmarks:
benchmark = BENCHMARKS[benchmark_name]
for policy_name in policies:
policy = POLICIES[policy_name]
num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
run_root = f"/benchmark-output/{run_rel}"
gradient_accumulation_steps = compute_gradient_accumulation_steps(
effective_batch_size=benchmark.effective_batch_size,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
)
train_dir = f"{run_root}/train"
checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
eval_dir = f"{run_root}/eval"
train_args = build_train_args(
benchmark=benchmark,
policy=policy,
train_dir=train_dir,
gradient_accumulation_steps=gradient_accumulation_steps,
)
eval_args = build_eval_args(
benchmark=benchmark,
policy=policy,
checkpoint_path=checkpoint_path,
eval_dir=eval_dir,
)
tokenizer_args = None
if policy.needs_tokenizer:
tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
tokenizer_args = {
"repo_id": benchmark.dataset_repo_id,
"output_dir": f"{run_root}/tokenizer",
"hub_repo_id": tokenizer_repo_id,
**policy.tokenizer_args,
}
train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
jobs.append(
PlannedJob(
benchmark=benchmark_name,
policy=policy_name,
run_rel=run_rel,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
gradient_accumulation_steps=gradient_accumulation_steps,
effective_batch_size=benchmark.effective_batch_size,
docker_image=benchmark.docker_image,
train_args=train_args,
eval_args=eval_args,
tokenizer_args=tokenizer_args,
script_path=script_path,
)
)
return jobs


def render_sbatch_script(
*,
job: PlannedJob,
output_dir: Path,
results_repo_id: str,
git_commit: str,
) -> str:
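    """Render a standalone sbatch script: optional tokenizer training, train, eval, publish.

    Every step runs inside the benchmark's Docker image with the host output
    directory mounted at /benchmark-output.
    """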
host_output_dir = output_dir.resolve()
run_root = f"/benchmark-output/{job.run_rel}"
host_run_root = host_output_dir / job.run_rel
cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
gpu_ids_expr = "${GPU_IDS}"
train_cli = format_cli_args(job.train_args)
eval_cli = format_cli_args(job.eval_args)
tokenizer_command = ""
if job.tokenizer_args:
tokenizer_cli = format_cli_args(job.tokenizer_args)
tokenizer_command = f"""
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-train-tokenizer \\
{tokenizer_cli}
'
"""
return f"""#!/bin/bash
#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
#SBATCH --gres=gpu:{job.num_gpus}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem_gb}G
#SBATCH --output={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.out
#SBATCH --error={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.err
set -euo pipefail
HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
RUN_ROOT="{run_root}"
mkdir -p "{host_output_dir}/logs"
mkdir -p "{host_run_root.parent}"
{tokenizer_command}
TRAIN_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
{train_cli}
'
TRAIN_END="$(date +%s)"
EVAL_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-eval \\
{eval_cli}
'
EVAL_END="$(date +%s)"
TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-e RUN_ROOT="${{RUN_ROOT}}" \\
-e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
-e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
uv run python benchmarks/publish_benchmark_result.py \\
--benchmark={job.benchmark} \\
--policy={job.policy} \\
--run_root="${{RUN_ROOT}}" \\
--results_repo={results_repo_id} \\
--git_commit={git_commit} \\
--num_gpus={job.num_gpus} \\
--microbatch_per_gpu={job.microbatch_per_gpu} \\
--gradient_accumulation_steps={job.gradient_accumulation_steps} \\
--effective_batch_size={job.effective_batch_size} \\
--train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
--eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
--slurm_job_id="${{SLURM_JOB_ID:-}}" \\
--docker_image={job.docker_image}
'
"""


def write_manifest(
*,
output_dir: Path,
jobs: list[PlannedJob],
git_commit: str,
hub_org: str,
results_repo: str,
) -> Path:
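    """Write manifest.json describing all generated jobs and return its path."""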
manifest = {
"generated_at": datetime.now(UTC).isoformat(),
"git_commit": git_commit,
"hub_org": hub_org,
"results_repo": results_repo,
"jobs": [asdict(job) for job in jobs],
}
manifest_path = output_dir / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
return manifest_path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--policies", nargs="*", default=None, help="Policy names to include (default: all).")
    parser.add_argument("--benchmarks", nargs="*", default=None, help="Benchmark names to include (default: all).")
    parser.add_argument("--output_dir", required=True, type=Path, help="Host directory for scripts, logs, and runs.")
    parser.add_argument("--hub_org", required=True, help="Hub org used to qualify bare repo names.")
    parser.add_argument("--results_repo", required=True, help="Hub repo (name or org/name) for published results.")
    parser.add_argument("--submit", action="store_true", help="Submit each generated script with sbatch.")
    return parser.parse_args()


def get_git_commit() -> str:
return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()


def main() -> int:
args = parse_args()
args.output_dir.mkdir(parents=True, exist_ok=True)
(args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
(args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
git_commit = get_git_commit()
results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
jobs = plan_jobs(
output_dir=args.output_dir,
hub_org=args.hub_org,
results_repo=results_repo_id,
policies=selected_policies,
benchmarks=selected_benchmarks,
)
for job in jobs:
script = render_sbatch_script(
job=job,
output_dir=args.output_dir,
results_repo_id=results_repo_id,
git_commit=git_commit,
)
script_path = Path(job.script_path)
script_path.write_text(script)
script_path.chmod(0o755)
if args.submit:
subprocess.run(["sbatch", str(script_path)], check=True)
manifest_path = write_manifest(
output_dir=args.output_dir,
jobs=jobs,
git_commit=git_commit,
hub_org=args.hub_org,
results_repo=results_repo_id,
)
print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
print(f"Manifest: {manifest_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())