feat(benchmarks): add matrix runner and leaderboard

2026-07-22 09:21:53 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
@@ -0,0 +1 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
+
+
+def load_json_if_exists(path: Path) -> dict[str, Any] | None:
+    if not path.exists():
+        return None
+    return json.loads(path.read_text())
+
+
+def find_latest_train_config_path(run_root: Path) -> Path | None:
+    checkpoints_dir = run_root / "train" / "checkpoints"
+    if not checkpoints_dir.exists():
+        return None
+    candidates = sorted(
+        checkpoints_dir.glob("*/pretrained_model/train_config.json"),
+        key=lambda path: path.parts[-3],
+    )
+    return candidates[-1] if candidates else None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--benchmark", required=True)
+    parser.add_argument("--policy", required=True)
+    parser.add_argument("--run_root", required=True, type=Path)
+    parser.add_argument("--results_repo", required=True)
+    parser.add_argument("--git_commit", required=True)
+    parser.add_argument("--num_gpus", required=True, type=int)
+    parser.add_argument("--microbatch_per_gpu", required=True, type=int)
+    parser.add_argument("--gradient_accumulation_steps", required=True, type=int)
+    parser.add_argument("--effective_batch_size", required=True, type=int)
+    parser.add_argument("--train_wall_time_s", required=True, type=float)
+    parser.add_argument("--eval_wall_time_s", required=True, type=float)
+    parser.add_argument("--slurm_job_id", default="")
+    parser.add_argument("--docker_image", required=True)
+    return parser.parse_args()
+
+
+def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
+    now = datetime.now(UTC)
+    created_at = now.isoformat()
+    timestamp = utc_timestamp_slug(now)
+    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
+    eval_info = load_json_if_exists(args.run_root / "eval" / "eval_info.json") or {}
+    train_config_path = find_latest_train_config_path(args.run_root)
+    train_config = load_json_if_exists(train_config_path) or {}
+
+    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
+    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"
+
+    row = {
+        "schema_version": 1,
+        "created_at": created_at,
+        "run_id": run_id,
+        "benchmark": args.benchmark,
+        "policy": args.policy,
+        "git_commit": args.git_commit,
+        "slurm_job_id": args.slurm_job_id or None,
+        "docker_image": args.docker_image,
+        "resources": {
+            "num_gpus": args.num_gpus,
+            "microbatch_per_gpu": args.microbatch_per_gpu,
+            "gradient_accumulation_steps": args.gradient_accumulation_steps,
+            "effective_batch_size": args.effective_batch_size,
+        },
+        "timings": {
+            "train_wall_time_s": args.train_wall_time_s,
+            "eval_wall_time_s": args.eval_wall_time_s,
+            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
+        },
+        "eval": {
+            "overall": eval_info.get("overall", {}),
+            "per_group": eval_info.get("per_group", {}),
+            "per_task_count": len(eval_info.get("per_task", [])),
+        },
+        "paths": {
+            "run_root": str(args.run_root),
+            "train_dir": str(args.run_root / "train"),
+            "eval_dir": str(args.run_root / "eval"),
+        },
+        "train_config": train_config,
+        "artifact_urls": {
+            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
+        },
+    }
+
+    row_path = args.run_root / "benchmark_row.json"
+    row_path.parent.mkdir(parents=True, exist_ok=True)
+    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]
+
+    eval_info_path = args.run_root / "eval" / "eval_info.json"
+    if eval_info_path.exists():
+        row["artifact_urls"]["eval_info"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/eval_info.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
+        )
+
+    if train_config_path is not None and train_config_path.exists():
+        row["artifact_urls"]["train_config"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/train_config.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
+        )
+
+    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+    return row, upload_list
+
+
+def main() -> int:
+    args = parse_args()
+    row, upload_list = build_row(args)
+    uploaded = upload_targets(
+        repo_id=args.results_repo,
+        targets=upload_list,
+        repo_type="dataset",
+        private=False,
+        commit_message=f"Add benchmark row {row['run_id']}",
+    )
+    row["uploaded_paths"] = uploaded
+    row_path = args.run_root / "benchmark_row.json"
+    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+    print(json.dumps(row, indent=2, sort_keys=True))
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,647 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import subprocess
+from dataclasses import asdict, dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from lerobot.utils.history_repo import utc_timestamp_slug
+
+# Bounds that plan_jobs() clamps each policy's requested GPU count to.
+MAX_GPUS = 8
+MIN_GPUS = 1
+# Default BenchmarkSpec.train_steps; also used for policy.scheduler_decay_steps in POLICIES.
+DEFAULT_STEPS = 20_000
+# Default BenchmarkSpec.effective_batch_size.
+DEFAULT_EFFECTIVE_BATCH_SIZE = 256
+# Default PolicySpec.microbatch_per_gpu.
+DEFAULT_MICROBATCH_PER_GPU = 32
+# Value used for "eval.batch_size" in the benchmark eval arguments.
+DEFAULT_EVAL_BATCH_SIZE = 1
+# Multiplied by the job's GPU count for the sbatch --cpus-per-task and --mem requests.
+DEFAULT_CPUS_PER_GPU = 8
+DEFAULT_MEMORY_PER_GPU_GB = 40
+
+
+@dataclass(frozen=True)
+class BenchmarkSpec:
+    """Static description of one benchmark: its dataset, image and eval setup."""
+
+    # Benchmark identifier.
+    name: str
+    # Passed as "dataset.repo_id" for training and as "repo_id" for tokenizer training.
+    dataset_repo_id: str
+    # Image used by every ``docker run`` stage of the generated sbatch script.
+    docker_image: str
+    # Passed as "env.type" to evaluation.
+    eval_env_type: str
+    # Passed as "env.task" to evaluation.
+    eval_task: str
+    # Passed as "eval.n_episodes" to evaluation.
+    eval_n_episodes: int
+    # Used for "steps" and "save_freq", and for the checkpoint directory name that is evaluated.
+    train_steps: int = DEFAULT_STEPS
+    # Target batch size used to derive the gradient accumulation steps.
+    effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
+    # Extra CLI arguments merged into the train arguments (before the policy's extras).
+    train_extra_args: dict[str, Any] = field(default_factory=dict)
+    # Extra CLI arguments merged into the eval arguments (before the policy's extras).
+    eval_extra_args: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class PolicySpec:
+    """Static description of one policy: how it is launched and trained."""
+
+    # Policy identifier.
+    name: str
+    # Passed as "policy.type" when no policy_path is set.
+    policy_type: str
+    # Requested GPU count; plan_jobs() clamps it to [MIN_GPUS, MAX_GPUS].
+    num_gpus: int
+    # When set, passed as "policy.path" instead of "policy.type".
+    policy_path: str | None = None
+    # Passed as "batch_size" to training.
+    microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
+    # Extra CLI arguments merged last into the train arguments.
+    extra_train_args: dict[str, Any] = field(default_factory=dict)
+    # Extra CLI arguments merged last into the eval arguments.
+    extra_eval_args: dict[str, Any] = field(default_factory=dict)
+    # When True, plan_jobs() adds a tokenizer training stage before training.
+    needs_tokenizer: bool = False
+    # Extra arguments for the tokenizer training stage.
+    tokenizer_args: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class PlannedJob:
+    """One fully resolved benchmark x policy job, ready to be rendered as an sbatch script."""
+
+    # Keys into BENCHMARKS and POLICIES respectively.
+    benchmark: str
+    policy: str
+    # Run directory relative to the output dir: "runs/<benchmark>/<policy>/<slug>".
+    run_rel: str
+    # GPU count after clamping to [MIN_GPUS, MAX_GPUS].
+    num_gpus: int
+    microbatch_per_gpu: int
+    gradient_accumulation_steps: int
+    effective_batch_size: int
+    docker_image: str
+    # CLI arguments for the train and eval commands.
+    train_args: dict[str, Any]
+    eval_args: dict[str, Any]
+    # CLI arguments for tokenizer training; None when the policy needs no tokenizer.
+    tokenizer_args: dict[str, Any] | None
+    # Location of the generated .sbatch file, stored as a string.
+    script_path: str
+
+
+# Benchmarks known to the runner, keyed by the names accepted by --benchmarks.
+# Each eval_task value is a comma-separated list inside a single string.
+BENCHMARKS: dict[str, BenchmarkSpec] = {
+    "libero_plus": BenchmarkSpec(
+        name="libero_plus",
+        dataset_repo_id="lerobot/libero_plus",
+        docker_image="lerobot-benchmark-libero-plus:latest",
+        eval_env_type="libero_plus",
+        eval_task="libero_spatial,libero_object,libero_goal,libero_10",
+        eval_n_episodes=10,
+        train_extra_args={
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.image2": "observation.images.camera2",
+            },
+        },
+        eval_extra_args={
+            "env.camera_name_mapping": {
+                "agentview_image": "camera1",
+                "robot0_eye_in_hand_image": "camera2",
+            },
+            "env.max_parallel_tasks": 1,
+            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
+            "eval.use_async_envs": False,
+            "eval.max_episodes_rendered": 0,
+            "policy.device": "cuda",
+        },
+    ),
+    "robomme": BenchmarkSpec(
+        name="robomme",
+        dataset_repo_id="lerobot/robomme",
+        docker_image="lerobot-benchmark-robomme:latest",
+        eval_env_type="robomme",
+        eval_task=(
+            "BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
+            "ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
+            "VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
+        ),
+        eval_n_episodes=50,
+        train_extra_args={
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.wrist_image": "observation.images.camera2",
+            },
+        },
+        eval_extra_args={
+            "env.dataset_split": "test",
+            "env.max_parallel_tasks": 1,
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.wrist_image": "observation.images.camera2",
+            },
+            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
+            "eval.use_async_envs": False,
+            "eval.max_episodes_rendered": 0,
+            "policy.device": "cuda",
+        },
+    ),
+}
+
+
+# Policies known to the runner, keyed by the names accepted by --policies.
+# Entries with a policy_path are launched with "policy.path"; the others with "policy.type".
+# NOTE(review): policy.scheduler_decay_steps is pinned to DEFAULT_STEPS here, while the number
+# of training steps comes from BenchmarkSpec.train_steps — confirm the two should stay in sync
+# if a benchmark overrides train_steps.
+POLICIES: dict[str, PolicySpec] = {
+    "pi0": PolicySpec(
+        name="pi0",
+        policy_type="pi0",
+        policy_path="lerobot/pi0_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+    ),
+    "pi0_fast": PolicySpec(
+        name="pi0_fast",
+        policy_type="pi0_fast",
+        policy_path="lerobot/pi0fast-base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+        needs_tokenizer=True,
+        tokenizer_args={
+            "action_horizon": 30,
+            "encoded_dims": "0:7",
+            "normalization_mode": "QUANTILES",
+            "vocab_size": 1024,
+            "scale": 10.0,
+            "push_to_hub": True,
+        },
+    ),
+    "pi05": PolicySpec(
+        name="pi05",
+        policy_type="pi05",
+        policy_path="lerobot/pi05_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+    ),
+    "groot": PolicySpec(
+        name="groot",
+        policy_type="groot",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
+            "policy.tune_diffusion_model": True,
+            "policy.tune_projector": True,
+            "policy.tune_llm": False,
+            "policy.tune_visual": False,
+            "policy.use_bf16": True,
+        },
+    ),
+    "act": PolicySpec(
+        name="act",
+        policy_type="act",
+        num_gpus=1,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+        },
+    ),
+    "diffusion": PolicySpec(
+        name="diffusion",
+        policy_type="diffusion",
+        num_gpus=1,
+        extra_train_args={
+            "policy.horizon": 32,
+            "policy.n_action_steps": 30,
+            "policy.n_obs_steps": 2,
+        },
+    ),
+    "smolvla": PolicySpec(
+        name="smolvla",
+        policy_type="smolvla",
+        policy_path="lerobot/smolvla_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.load_vlm_weights": True,
+            "policy.freeze_vision_encoder": False,
+            "policy.train_expert_only": False,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 1,
+        },
+    ),
+    "xvla": PolicySpec(
+        name="xvla",
+        policy_type="xvla",
+        policy_path="lerobot/xvla-widowx",
+        num_gpus=4,
+        extra_train_args={
+            "policy.n_action_steps": 32,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 1,
+        },
+    ),
+    "multi_task_dit": PolicySpec(
+        name="multi_task_dit",
+        policy_type="multi_task_dit",
+        num_gpus=1,
+        extra_train_args={
+            "policy.horizon": 32,
+            "policy.n_action_steps": 30,
+        },
+    ),
+}
+
+
+def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
+    return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"
+
+
+def get_requested_names(
+    requested: list[str] | None,
+    available: dict[str, Any],
+    *,
+    kind: str,
+) -> list[str]:
+    if not requested:
+        return list(available)
+    unknown = sorted(set(requested) - set(available))
+    if unknown:
+        raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
+    return requested
+
+
+def compute_gradient_accumulation_steps(
+    *,
+    effective_batch_size: int,
+    num_gpus: int,
+    microbatch_per_gpu: int,
+) -> int:
+    per_step_batch = num_gpus * microbatch_per_gpu
+    if effective_batch_size % per_step_batch != 0:
+        raise ValueError(
+            f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
+            f"{microbatch_per_gpu=}."
+        )
+    return effective_batch_size // per_step_batch
+
+
+def make_run_slug() -> str:
+    """Return the slug produced by ``utc_timestamp_slug()`` called without arguments.
+
+    plan_jobs() uses it as the last component of each run directory.
+    """
+    return utc_timestamp_slug()
+
+
+def shell_value(value: Any) -> str:
+    if isinstance(value, bool):
+        value = "true" if value else "false"
+    elif isinstance(value, (dict, list)):
+        value = json.dumps(value, sort_keys=True)
+    else:
+        value = str(value)
+    escaped = (
+        value.replace("\\", "\\\\")
+        .replace('"', '\\"')
+        .replace("$", "\\$")
+        .replace("`", "\\`")
+    )
+    return f'"{escaped}"'
+
+
+def format_cli_args(args: dict[str, Any]) -> str:
+    lines = []
+    for key, value in args.items():
+        lines.append(f"  --{key}={shell_value(value)}")
+    return " \\\n".join(lines)
+
+
+def build_train_args(
+    *,
+    benchmark: BenchmarkSpec,
+    policy: PolicySpec,
+    train_dir: str,
+    gradient_accumulation_steps: int,
+) -> dict[str, Any]:
+    args: dict[str, Any] = {
+        "dataset.repo_id": benchmark.dataset_repo_id,
+        "output_dir": train_dir,
+        "steps": benchmark.train_steps,
+        "batch_size": policy.microbatch_per_gpu,
+        "gradient_accumulation_steps": gradient_accumulation_steps,
+        "eval_freq": 0,
+        "save_freq": benchmark.train_steps,
+        "save_checkpoint": True,
+        "log_freq": 100,
+        "wandb.enable": False,
+        "policy.push_to_hub": False,
+        "policy.device": "cuda",
+    }
+    if policy.policy_path:
+        args["policy.path"] = policy.policy_path
+    else:
+        args["policy.type"] = policy.policy_type
+    args.update(benchmark.train_extra_args)
+    args.update(policy.extra_train_args)
+    return args
+
+
+def build_eval_args(
+    *,
+    benchmark: BenchmarkSpec,
+    policy: PolicySpec,
+    checkpoint_path: str,
+    eval_dir: str,
+) -> dict[str, Any]:
+    args: dict[str, Any] = {
+        "policy.path": checkpoint_path,
+        "env.type": benchmark.eval_env_type,
+        "env.task": benchmark.eval_task,
+        "eval.n_episodes": benchmark.eval_n_episodes,
+        "output_dir": eval_dir,
+    }
+    args.update(benchmark.eval_extra_args)
+    args.update(policy.extra_eval_args)
+    return args
+
+
+def plan_jobs(
+    *,
+    output_dir: Path,
+    hub_org: str,
+    results_repo: str,
+    policies: list[str],
+    benchmarks: list[str],
+) -> list[PlannedJob]:
+    _ = hub_org
+    _ = results_repo
+    scripts_dir = output_dir / "slurm"
+    jobs: list[PlannedJob] = []
+    for benchmark_name in benchmarks:
+        benchmark = BENCHMARKS[benchmark_name]
+        for policy_name in policies:
+            policy = POLICIES[policy_name]
+            num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
+            run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
+            run_root = f"/benchmark-output/{run_rel}"
+            gradient_accumulation_steps = compute_gradient_accumulation_steps(
+                effective_batch_size=benchmark.effective_batch_size,
+                num_gpus=num_gpus,
+                microbatch_per_gpu=policy.microbatch_per_gpu,
+            )
+            train_dir = f"{run_root}/train"
+            checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
+            eval_dir = f"{run_root}/eval"
+            train_args = build_train_args(
+                benchmark=benchmark,
+                policy=policy,
+                train_dir=train_dir,
+                gradient_accumulation_steps=gradient_accumulation_steps,
+            )
+            eval_args = build_eval_args(
+                benchmark=benchmark,
+                policy=policy,
+                checkpoint_path=checkpoint_path,
+                eval_dir=eval_dir,
+            )
+            tokenizer_args = None
+            if policy.needs_tokenizer:
+                tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
+                tokenizer_args = {
+                    "repo_id": benchmark.dataset_repo_id,
+                    "output_dir": f"{run_root}/tokenizer",
+                    "hub_repo_id": tokenizer_repo_id,
+                    **policy.tokenizer_args,
+                }
+                train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
+            script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
+            jobs.append(
+                PlannedJob(
+                    benchmark=benchmark_name,
+                    policy=policy_name,
+                    run_rel=run_rel,
+                    num_gpus=num_gpus,
+                    microbatch_per_gpu=policy.microbatch_per_gpu,
+                    gradient_accumulation_steps=gradient_accumulation_steps,
+                    effective_batch_size=benchmark.effective_batch_size,
+                    docker_image=benchmark.docker_image,
+                    train_args=train_args,
+                    eval_args=eval_args,
+                    tokenizer_args=tokenizer_args,
+                    script_path=script_path,
+                )
+            )
+    return jobs
+
+
+def render_sbatch_script(
+    *,
+    job: PlannedJob,
+    output_dir: Path,
+    results_repo_id: str,
+    git_commit: str,
+) -> str:
+    """Render the text of the sbatch script for one planned job.
+
+    The script runs up to four ``docker run`` stages in order: an optional
+    tokenizer training stage (only when ``job.tokenizer_args`` is set),
+    training through ``accelerate launch``, evaluation through
+    ``lerobot-eval``, and publishing through
+    ``benchmarks/publish_benchmark_result.py``. Every stage bind-mounts the
+    resolved ``output_dir`` at ``/benchmark-output`` inside the container.
+    Train and eval wall times are measured with ``date +%s`` around their
+    stages and forwarded to the publish stage.
+
+    Args:
+        job: The planned job to render.
+        output_dir: Host output directory; resolved to an absolute path.
+        results_repo_id: Repository id passed to the publish stage.
+        git_commit: Commit hash passed to the publish stage.
+
+    Returns:
+        The complete script as a string; nothing is written to disk here.
+    """
+    host_output_dir = output_dir.resolve()
+    # Run root as seen inside the container versus on the host.
+    run_root = f"/benchmark-output/{job.run_rel}"
+    host_run_root = host_output_dir / job.run_rel
+    # CPU and memory requests scale with the GPU count; max() keeps one GPU's worth as a floor.
+    cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
+    mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
+    # Literal shell expansion, resolved when the script runs (GPU_IDS is set in the script).
+    gpu_ids_expr = "${GPU_IDS}"
+    train_cli = format_cli_args(job.train_args)
+    eval_cli = format_cli_args(job.eval_args)
+    # Stays empty when the job needs no tokenizer, leaving a blank section in the script.
+    tokenizer_command = ""
+    if job.tokenizer_args:
+        tokenizer_cli = format_cli_args(job.tokenizer_args)
+        tokenizer_command = f"""
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    lerobot-train-tokenizer \\
+{tokenizer_cli}
+  '
+"""
+    # NOTE(review): the CLI blocks below are embedded in single-quoted `bash -lc '...'` strings,
+    # so an argument value containing a single quote would end that string early — confirm no
+    # configured value contains one.
+    return f"""#!/bin/bash
+#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
+#SBATCH --gres=gpu:{job.num_gpus}
+#SBATCH --cpus-per-task={cpus_per_task}
+#SBATCH --mem={mem_gb}G
+#SBATCH --output={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.out
+#SBATCH --error={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.err
+
+set -euo pipefail
+
+HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
+GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
+RUN_ROOT="{run_root}"
+
+mkdir -p "{host_output_dir}/logs"
+mkdir -p "{host_run_root.parent}"
+
+{tokenizer_command}
+
+TRAIN_START="$(date +%s)"
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
+{train_cli}
+  '
+TRAIN_END="$(date +%s)"
+
+EVAL_START="$(date +%s)"
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    lerobot-eval \\
+{eval_cli}
+  '
+EVAL_END="$(date +%s)"
+TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
+EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
+
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -e RUN_ROOT="${{RUN_ROOT}}" \\
+  -e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
+  -e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    uv run python benchmarks/publish_benchmark_result.py \\
+      --benchmark={job.benchmark} \\
+      --policy={job.policy} \\
+      --run_root="${{RUN_ROOT}}" \\
+      --results_repo={results_repo_id} \\
+      --git_commit={git_commit} \\
+      --num_gpus={job.num_gpus} \\
+      --microbatch_per_gpu={job.microbatch_per_gpu} \\
+      --gradient_accumulation_steps={job.gradient_accumulation_steps} \\
+      --effective_batch_size={job.effective_batch_size} \\
+      --train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
+      --eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
+      --slurm_job_id="${{SLURM_JOB_ID:-}}" \\
+      --docker_image={job.docker_image}
+  '
+"""
+
+
+def write_manifest(
+    *,
+    output_dir: Path,
+    jobs: list[PlannedJob],
+    git_commit: str,
+    hub_org: str,
+    results_repo: str,
+) -> Path:
+    manifest = {
+        "generated_at": datetime.now(UTC).isoformat(),
+        "git_commit": git_commit,
+        "hub_org": hub_org,
+        "results_repo": results_repo,
+        "jobs": [asdict(job) for job in jobs],
+    }
+    manifest_path = output_dir / "manifest.json"
+    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
+    return manifest_path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--policies", nargs="*", default=None)
+    parser.add_argument("--benchmarks", nargs="*", default=None)
+    parser.add_argument("--output_dir", required=True, type=Path)
+    parser.add_argument("--hub_org", required=True)
+    parser.add_argument("--results_repo", required=True)
+    parser.add_argument("--submit", action="store_true")
+    return parser.parse_args()
+
+
+def get_git_commit() -> str:
+    return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
+
+
+def main() -> int:
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    (args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
+    (args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
+
+    selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
+    selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
+    git_commit = get_git_commit()
+    results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
+
+    jobs = plan_jobs(
+        output_dir=args.output_dir,
+        hub_org=args.hub_org,
+        results_repo=results_repo_id,
+        policies=selected_policies,
+        benchmarks=selected_benchmarks,
+    )
+
+    for job in jobs:
+        script = render_sbatch_script(
+            job=job,
+            output_dir=args.output_dir,
+            results_repo_id=results_repo_id,
+            git_commit=git_commit,
+        )
+        script_path = Path(job.script_path)
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+        if args.submit:
+            subprocess.run(["sbatch", str(script_path)], check=True)
+
+    manifest_path = write_manifest(
+        output_dir=args.output_dir,
+        jobs=jobs,
+        git_commit=git_commit,
+        hub_org=args.hub_org,
+        results_repo=results_repo_id,
+    )
+    print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
+    print(f"Manifest: {manifest_path}")
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
				`@@ -0,0 +1 @@`
				`# Copyright 2026 The HuggingFace Inc. team. All rights reserved.`