#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
from __future__ import annotations

import argparse
import json
import subprocess
from dataclasses import asdict, dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from lerobot.utils.history_repo import utc_timestamp_slug

# GPU-count clamps, training defaults, and per-GPU SLURM resource sizing.
MAX_GPUS = 8
MIN_GPUS = 1
DEFAULT_STEPS = 20_000
DEFAULT_EFFECTIVE_BATCH_SIZE = 256
DEFAULT_MICROBATCH_PER_GPU = 32
DEFAULT_EVAL_BATCH_SIZE = 1
DEFAULT_CPUS_PER_GPU = 8
DEFAULT_MEMORY_PER_GPU_GB = 40


@dataclass(frozen=True)
class BenchmarkSpec:
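    """One benchmark: dataset, eval environment/tasks, Docker image, and overrides."""
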
name: str
dataset_repo_id: str
docker_image: str
eval_env_type: str
eval_task: str
eval_n_episodes: int
train_steps: int = DEFAULT_STEPS
effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
train_extra_args: dict[str, Any] = field(default_factory=dict)
eval_extra_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PolicySpec:
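    """One policy: type, pretrained path, GPU budget, and per-policy overrides."""
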
name: str
policy_type: str
num_gpus: int
policy_path: str | None = None
microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
extra_train_args: dict[str, Any] = field(default_factory=dict)
extra_eval_args: dict[str, Any] = field(default_factory=dict)
needs_tokenizer: bool = False
tokenizer_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PlannedJob:
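    """A fully resolved benchmark x policy job, as serialized into manifest.json."""
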
benchmark: str
policy: str
run_rel: str
num_gpus: int
microbatch_per_gpu: int
gradient_accumulation_steps: int
effective_batch_size: int
docker_image: str
train_args: dict[str, Any]
eval_args: dict[str, Any]
tokenizer_args: dict[str, Any] | None
script_path: str


BENCHMARKS: dict[str, BenchmarkSpec] = {
"libero_plus": BenchmarkSpec(
name="libero_plus",
dataset_repo_id="lerobot/libero_plus",
docker_image="lerobot-benchmark-libero-plus:latest",
eval_env_type="libero_plus",
eval_task="libero_spatial,libero_object,libero_goal,libero_10",
eval_n_episodes=10,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.image2": "observation.images.camera2",
},
},
eval_extra_args={
"env.camera_name_mapping": {
"agentview_image": "camera1",
"robot0_eye_in_hand_image": "camera2",
},
"env.max_parallel_tasks": 1,
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
"robomme": BenchmarkSpec(
name="robomme",
dataset_repo_id="lerobot/robomme",
docker_image="lerobot-benchmark-robomme:latest",
eval_env_type="robomme",
eval_task=(
"BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
"ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
"VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
),
eval_n_episodes=50,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
},
eval_extra_args={
"env.dataset_split": "test",
"env.max_parallel_tasks": 1,
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
}


POLICIES: dict[str, PolicySpec] = {
"pi0": PolicySpec(
name="pi0",
policy_type="pi0",
policy_path="lerobot/pi0_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"pi0_fast": PolicySpec(
name="pi0_fast",
policy_type="pi0_fast",
policy_path="lerobot/pi0fast-base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
needs_tokenizer=True,
tokenizer_args={
"action_horizon": 30,
"encoded_dims": "0:7",
"normalization_mode": "QUANTILES",
"vocab_size": 1024,
"scale": 10.0,
"push_to_hub": True,
},
),
"pi05": PolicySpec(
name="pi05",
policy_type="pi05",
policy_path="lerobot/pi05_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"groot": PolicySpec(
name="groot",
policy_type="groot",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.base_model_path": "nvidia/GR00T-N1.5-3B",
"policy.tune_diffusion_model": True,
"policy.tune_projector": True,
"policy.tune_llm": False,
"policy.tune_visual": False,
"policy.use_bf16": True,
},
),
"act": PolicySpec(
name="act",
policy_type="act",
num_gpus=1,
extra_train_args={
"policy.n_action_steps": 30,
},
),
"diffusion": PolicySpec(
name="diffusion",
policy_type="diffusion",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
"policy.n_obs_steps": 2,
},
),
"smolvla": PolicySpec(
name="smolvla",
policy_type="smolvla",
policy_path="lerobot/smolvla_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.load_vlm_weights": True,
"policy.freeze_vision_encoder": False,
"policy.train_expert_only": False,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"xvla": PolicySpec(
name="xvla",
policy_type="xvla",
policy_path="lerobot/xvla-widowx",
num_gpus=4,
extra_train_args={
"policy.n_action_steps": 32,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"multi_task_dit": PolicySpec(
name="multi_task_dit",
policy_type="multi_task_dit",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
},
),
}
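
# To benchmark another policy, register a PolicySpec above. A minimal sketch
# (hypothetical entry; values are illustrative, not tuned):
#
#     POLICIES["my_policy"] = PolicySpec(
#         name="my_policy",
#         policy_type="my_policy",
#         num_gpus=2,
#         extra_train_args={"policy.n_action_steps": 30},
#     )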


def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
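    """Qualify a bare repo name with the hub org; pass fully qualified ids through."""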
return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"


def get_requested_names(
requested: list[str] | None,
available: dict[str, Any],
*,
kind: str,
) -> list[str]:
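    """Validate requested names against the registry; default to every entry."""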
if not requested:
return list(available)
unknown = sorted(set(requested) - set(available))
if unknown:
raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
return requested


def compute_gradient_accumulation_steps(
*,
effective_batch_size: int,
num_gpus: int,
microbatch_per_gpu: int,
) -> int:
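    """Number of accumulation steps needed to reach the effective batch size.

    For example, effective_batch_size=256 with num_gpus=8 and microbatch_per_gpu=32
    gives 256 // (8 * 32) == 1, while num_gpus=1 gives 256 // (1 * 32) == 8.
    """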
per_step_batch = num_gpus * microbatch_per_gpu
if effective_batch_size % per_step_batch != 0:
raise ValueError(
f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
f"{microbatch_per_gpu=}."
)
return effective_batch_size // per_step_batch


def make_run_slug() -> str:
return utc_timestamp_slug()


def shell_value(value: Any) -> str:
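    """Render a value as a double-quoted, escaped shell literal.

    Bools become "true"/"false" and dicts/lists become JSON. Backslashes, double
    quotes, "$", and backticks are escaped. Rendered values are later embedded in
    single-quoted `bash -lc` blocks, so values containing single quotes are not
    supported.
    """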
if isinstance(value, bool):
value = "true" if value else "false"
elif isinstance(value, (dict, list)):
value = json.dumps(value, sort_keys=True)
else:
value = str(value)
escaped = (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("$", "\\$")
.replace("`", "\\`")
)
return f'"{escaped}"'


def format_cli_args(args: dict[str, Any]) -> str:
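    """Render args as `--key=value` lines joined by shell line continuations."""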
lines = []
for key, value in args.items():
lines.append(f" --{key}={shell_value(value)}")
return " \\\n".join(lines)


def build_train_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
train_dir: str,
gradient_accumulation_steps: int,
) -> dict[str, Any]:
args: dict[str, Any] = {
"dataset.repo_id": benchmark.dataset_repo_id,
"output_dir": train_dir,
"steps": benchmark.train_steps,
"batch_size": policy.microbatch_per_gpu,
"gradient_accumulation_steps": gradient_accumulation_steps,
"eval_freq": 0,
"save_freq": benchmark.train_steps,
"save_checkpoint": True,
"log_freq": 100,
"wandb.enable": False,
"policy.push_to_hub": False,
"policy.device": "cuda",
}
if policy.policy_path:
args["policy.path"] = policy.policy_path
else:
args["policy.type"] = policy.policy_type
args.update(benchmark.train_extra_args)
args.update(policy.extra_train_args)
return args


def build_eval_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
checkpoint_path: str,
eval_dir: str,
) -> dict[str, Any]:
args: dict[str, Any] = {
"policy.path": checkpoint_path,
"env.type": benchmark.eval_env_type,
"env.task": benchmark.eval_task,
"eval.n_episodes": benchmark.eval_n_episodes,
"output_dir": eval_dir,
}
args.update(benchmark.eval_extra_args)
args.update(policy.extra_eval_args)
return args


def plan_jobs(
*,
output_dir: Path,
hub_org: str,
results_repo: str,
policies: list[str],
benchmarks: list[str],
) -> list[PlannedJob]:
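    """Expand the benchmark x policy matrix into fully resolved PlannedJob entries."""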
    _ = results_repo  # unused here; the results repo is consumed by render_sbatch_script
scripts_dir = output_dir / "slurm"
jobs: list[PlannedJob] = []
for benchmark_name in benchmarks:
benchmark = BENCHMARKS[benchmark_name]
for policy_name in policies:
policy = POLICIES[policy_name]
num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
run_root = f"/benchmark-output/{run_rel}"
gradient_accumulation_steps = compute_gradient_accumulation_steps(
effective_batch_size=benchmark.effective_batch_size,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
)
train_dir = f"{run_root}/train"
checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
eval_dir = f"{run_root}/eval"
train_args = build_train_args(
benchmark=benchmark,
policy=policy,
train_dir=train_dir,
gradient_accumulation_steps=gradient_accumulation_steps,
)
eval_args = build_eval_args(
benchmark=benchmark,
policy=policy,
checkpoint_path=checkpoint_path,
eval_dir=eval_dir,
)
tokenizer_args = None
if policy.needs_tokenizer:
tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
tokenizer_args = {
"repo_id": benchmark.dataset_repo_id,
"output_dir": f"{run_root}/tokenizer",
"hub_repo_id": tokenizer_repo_id,
**policy.tokenizer_args,
}
train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
jobs.append(
PlannedJob(
benchmark=benchmark_name,
policy=policy_name,
run_rel=run_rel,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
gradient_accumulation_steps=gradient_accumulation_steps,
effective_batch_size=benchmark.effective_batch_size,
docker_image=benchmark.docker_image,
train_args=train_args,
eval_args=eval_args,
tokenizer_args=tokenizer_args,
script_path=script_path,
)
)
return jobs


def render_sbatch_script(
*,
job: PlannedJob,
output_dir: Path,
results_repo_id: str,
git_commit: str,
) -> str:
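    """Render a standalone sbatch script: optional tokenizer training, train, eval, publish.

    Every step runs inside the benchmark's Docker image with the host output
    directory mounted at /benchmark-output.
    """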
host_output_dir = output_dir.resolve()
run_root = f"/benchmark-output/{job.run_rel}"
host_run_root = host_output_dir / job.run_rel
cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
gpu_ids_expr = "${GPU_IDS}"
train_cli = format_cli_args(job.train_args)
eval_cli = format_cli_args(job.eval_args)
tokenizer_command = ""
if job.tokenizer_args:
tokenizer_cli = format_cli_args(job.tokenizer_args)
tokenizer_command = f"""
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-train-tokenizer \\
{tokenizer_cli}
'
"""
return f"""#!/bin/bash
#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
#SBATCH --gres=gpu:{job.num_gpus}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem_gb}G
#SBATCH --output={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.out
#SBATCH --error={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.err
set -euo pipefail
HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
RUN_ROOT="{run_root}"
mkdir -p "{host_output_dir}/logs"
mkdir -p "{host_run_root.parent}"
{tokenizer_command}
TRAIN_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
{train_cli}
'
TRAIN_END="$(date +%s)"
EVAL_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-eval \\
{eval_cli}
'
EVAL_END="$(date +%s)"
TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-e RUN_ROOT="${{RUN_ROOT}}" \\
-e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
-e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
uv run python benchmarks/publish_benchmark_result.py \\
--benchmark={job.benchmark} \\
--policy={job.policy} \\
--run_root="${{RUN_ROOT}}" \\
--results_repo={results_repo_id} \\
--git_commit={git_commit} \\
--num_gpus={job.num_gpus} \\
--microbatch_per_gpu={job.microbatch_per_gpu} \\
--gradient_accumulation_steps={job.gradient_accumulation_steps} \\
--effective_batch_size={job.effective_batch_size} \\
--train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
--eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
--slurm_job_id="${{SLURM_JOB_ID:-}}" \\
--docker_image={job.docker_image}
'
"""


def write_manifest(
*,
output_dir: Path,
jobs: list[PlannedJob],
git_commit: str,
hub_org: str,
results_repo: str,
) -> Path:
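    """Write manifest.json describing all generated jobs and return its path."""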
manifest = {
"generated_at": datetime.now(UTC).isoformat(),
"git_commit": git_commit,
"hub_org": hub_org,
"results_repo": results_repo,
"jobs": [asdict(job) for job in jobs],
}
manifest_path = output_dir / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
return manifest_path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--policies", nargs="*", default=None, help="Policy names to include (default: all).")
    parser.add_argument("--benchmarks", nargs="*", default=None, help="Benchmark names to include (default: all).")
    parser.add_argument("--output_dir", required=True, type=Path, help="Host directory for scripts, logs, and runs.")
    parser.add_argument("--hub_org", required=True, help="Hub org used to qualify bare repo names.")
    parser.add_argument("--results_repo", required=True, help="Hub repo (name or org/name) for published results.")
    parser.add_argument("--submit", action="store_true", help="Submit each generated script with sbatch.")
    return parser.parse_args()


def get_git_commit() -> str:
return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()


def main() -> int:
args = parse_args()
args.output_dir.mkdir(parents=True, exist_ok=True)
(args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
(args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
git_commit = get_git_commit()
results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
jobs = plan_jobs(
output_dir=args.output_dir,
hub_org=args.hub_org,
results_repo=results_repo_id,
policies=selected_policies,
benchmarks=selected_benchmarks,
)
for job in jobs:
script = render_sbatch_script(
job=job,
output_dir=args.output_dir,
results_repo_id=results_repo_id,
git_commit=git_commit,
)
script_path = Path(job.script_path)
script_path.write_text(script)
script_path.chmod(0o755)
if args.submit:
subprocess.run(["sbatch", str(script_path)], check=True)
manifest_path = write_manifest(
output_dir=args.output_dir,
jobs=jobs,
git_commit=git_commit,
hub_org=args.hub_org,
results_repo=results_repo_id,
)
print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
print(f"Manifest: {manifest_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())