Compare commits

...

3 Commits

Author SHA1 Message Date
Pepijn 2ab59a3099 feat(benchmarks): add matrix runner and leaderboard 2026-04-15 21:31:33 +02:00
Pepijn dab511dbb1 Merge branch 'main' into feat/libero-benchmark 2026-04-14 10:43:49 +02:00
Pepijn fd00e38851 feat(benchmarks): add LIBERO training benchmark pipeline
Single-script benchmark that trains and evaluates all 9 LeRobot policies
on LIBERO. Each SLURM job self-publishes its result row to a HuggingFace
leaderboard dataset — no separate collection step needed.

Policies: pi0, pi0_fast, pi05, groot, act, diffusion, smolvla, xvla,
multi_task_dit. 5000 steps, BS 256, with per-policy GPU allocation and
default LR/scheduler presets.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 17:01:49 +02:00
23 changed files with 2762 additions and 50 deletions
+178
View File
@@ -310,3 +310,181 @@ jobs:
name: metaworld-metrics
path: /tmp/metaworld-artifacts/metrics.json
if-no-files-found: warn
# ── LIBERO-plus ───────────────────────────────────────────────────────────
libero-plus-integration-test:
name: LIBERO-plus — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Build LIBERO-plus benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.libero_plus
push: false
load: true
tags: lerobot-benchmark-libero-plus:ci
cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
- name: Run LIBERO-plus smoke eval (1 episode)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name libero-plus-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
lerobot-benchmark-libero-plus:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=lerobot/smolvla_libero_plus \
--env.type=libero_plus \
--env.task=libero_spatial \
'--env.task_ids=[0,100,260,500,1000,1500,2000,2400]' \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
--policy.empty_cameras=1 \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env libero_plus --task libero_spatial \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy LIBERO-plus artifacts from container
if: always()
run: |
mkdir -p /tmp/libero-plus-artifacts
docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
docker rm -f libero-plus-eval || true
- name: Parse LIBERO-plus eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/libero-plus-artifacts \
--env libero_plus \
--task libero_spatial \
--policy lerobot/smolvla_libero_plus
- name: Upload LIBERO-plus rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: libero-plus-rollout-video
path: /tmp/libero-plus-artifacts/videos/
if-no-files-found: warn
- name: Upload LIBERO-plus eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: libero-plus-metrics
path: /tmp/libero-plus-artifacts/metrics.json
if-no-files-found: warn
# ── ROBOMME ───────────────────────────────────────────────────────────────
robomme-integration-test:
name: RoboMME — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Build RoboMME benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.robomme
push: false
load: true
tags: lerobot-benchmark-robomme:ci
- name: Run RoboMME smoke eval (1 episode)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name robomme-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
lerobot-benchmark-robomme:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=lerobot/smolvla_robomme \
--env.type=robomme \
--env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
--env.dataset_split=test \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
--policy.empty_cameras=3 \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env robomme --task PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy RoboMME artifacts from container
if: always()
run: |
mkdir -p /tmp/robomme-artifacts
docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
docker rm -f robomme-eval || true
- name: Parse RoboMME eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/robomme-artifacts \
--env robomme \
--task PickXtimes \
--policy lerobot/smolvla_robomme
- name: Upload RoboMME rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robomme-rollout-video
path: /tmp/robomme-artifacts/videos/
if-no-files-found: warn
- name: Upload RoboMME eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robomme-metrics
path: /tmp/robomme-artifacts/metrics.json
if-no-files-found: warn
+1
View File
@@ -0,0 +1 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+60
View File
@@ -0,0 +1,60 @@
# LeRobot LIBERO Training Benchmark
Train and evaluate all LeRobot policies on [LIBERO](https://libero-project.github.io/) and publish results as a HuggingFace leaderboard dataset.
## Policies
| Policy | Base Model | GPUs | LR | Chunk | Notes |
| -------------- | -------------------- | ---- | ------ | ----- | ------------------------------------- |
| pi0 | lerobot/pi0_base | 8 | 2.5e-5 | 30 | PaliGemma + Gemma flow matching |
| pi0_fast | lerobot/pi0fast-base | 8 | 2.5e-5 | 30 | Requires tokenizer pre-training |
| pi05 | lerobot/pi05_base | 8 | 2.5e-5 | 30 | Quantiles normalization |
| groot | nvidia/GR00T-N1.5-3B | 8 | 1e-4 | 30 | bf16, diffusion head + projector only |
| act | From scratch | 1 | 1e-5 | 30 | ResNet-18, lightweight |
| diffusion | From scratch | 1 | 1e-4 | 32\* | U-Net, horizon must be divisible by 8 |
| smolvla | lerobot/smolvla_base | 8 | 1e-4 | 30 | SmolVLM2-500M |
| xvla | lerobot/xvla-widowx | 4 | 1e-4 | 32\* | Florence2 + CLIP |
| multi_task_dit | From scratch | 1 | 2e-5 | 32\* | CLIP + DiT |
\* Set to 32, the nearest valid value to 30: `diffusion` and `multi_task_dit` configure `horizon` rather than `chunk_size` (the diffusion U-Net horizon must be divisible by 8), while `xvla` uses `chunk_size=32`.
## Training spec
- **Steps**: 5,000 per policy
- **Batch size**: 32 per GPU (effective batch size = 32 × num GPUs, i.e. 256 for the 8-GPU policies; see the sketch after this list)
- **Dataset**: `lerobot/libero` (libero_spatial)
- **Evaluation**: 20 episodes after training
- **LR/scheduler**: each policy's default optimizer and scheduler preset
- **Results**: each SLURM job publishes its own row to the HF leaderboard dataset automatically
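
The per-policy effective batch size follows directly from the GPU column of the table above (a quick illustrative sketch; this runner uses no gradient accumulation):

```python
# Effective batch size = per-GPU batch x GPU count.
PER_GPU_BATCH = 32
GPUS = {"pi0": 8, "xvla": 4, "act": 1}
for policy, n in GPUS.items():
    print(f"{policy}: effective batch = {PER_GPU_BATCH * n}")
# pi0: effective batch = 256
# xvla: effective batch = 128
# act: effective batch = 32
```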
## Quick start
### 1. Generate SLURM scripts
```bash
python benchmarks/libero/run_benchmark.py \
--output_dir /scratch/lerobot-benchmark \
--hub_org lerobot
```
### 2. Submit jobs
```bash
# If using pi0_fast, submit tokenizer first:
sbatch /scratch/lerobot-benchmark/slurm_scripts/00_tokenizer.sh
# Wait, then submit pi0_fast
# All other policies can run in parallel:
for script in /scratch/lerobot-benchmark/slurm_scripts/[0-9][0-9]_*.sh; do
[[ "$script" == *pi0_fast* ]] && continue
sbatch "$script"
done
```
Each job publishes its result to `lerobot/benchmark-libero` on the Hub when it finishes.
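
To inspect the published rows locally, something like the following works (a minimal sketch; it assumes the `datasets` library is installed and uses the field names from the row schema in `run_benchmark.py`):

```python
from datasets import load_dataset

rows = load_dataset("lerobot/benchmark-libero", split="train")
# Sort by success rate, treating unfinished runs (None) as 0.
for row in sorted(rows, key=lambda r: r["eval_success_rate"] or 0, reverse=True):
    print(f"{row['policy_type']:>15}  success={row['eval_success_rate']}  "
          f"train_time_s={row['training_time_s']:.0f}")
```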
## Prerequisites
- SLURM cluster with CUDA GPUs (A100 80GB recommended for VLM policies)
- `pip install "lerobot[pi,smolvla,groot,xvla,multi_task_dit,libero]" datasets` (quoted so the shell does not glob the extras)
- `huggingface-cli login`
+606
View File
@@ -0,0 +1,606 @@
#!/usr/bin/env python
"""Generate SLURM sbatch scripts for training all LeRobot policies on LIBERO.
Each generated script trains one policy, evaluates it, and publishes its
results row to a HuggingFace leaderboard dataset — no separate collection
step needed.
Usage:
# Generate scripts for all policies:
python benchmarks/libero/run_benchmark.py \\
--output_dir /scratch/lerobot-benchmark --hub_org lerobot
# Generate for a subset:
python benchmarks/libero/run_benchmark.py \\
--policies pi0 smolvla act \\
--output_dir /scratch/lerobot-benchmark --hub_org lerobot
"""
from __future__ import annotations
import argparse
import json
import subprocess
import textwrap
import uuid
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
# ──────────────────────────────────────────────────────────────────────
# Policy benchmark configs
# ──────────────────────────────────────────────────────────────────────
@dataclass
class PolicyBenchmarkConfig:
"""Training configuration for a single policy on a benchmark."""
policy_type: str
policy_path: str | None = None
num_gpus: int = 1
chunk_size: int | None = None # Set on policies that use chunk_size (not horizon)
extra_policy_args: dict[str, str] = field(default_factory=dict)
needs_tokenizer: bool = False
tokenizer_args: dict[str, str] = field(default_factory=dict)
COMMON_TRAINING_ARGS: dict[str, str] = {
"dataset.repo_id": "lerobot/libero",
"dataset.use_imagenet_stats": "false",
"env.type": "libero",
"env.task": "libero_spatial",
"steps": "5000",
"batch_size": "32",
"eval_freq": "0",
"save_freq": "5000",
"save_checkpoint": "true",
"log_freq": "100",
"wandb.enable": "true",
"policy.push_to_hub": "true",
"rename_map": (
'{"observation.images.image":"observation.images.camera1",'
'"observation.images.image2":"observation.images.camera2"}'
),
}
EVAL_ARGS: dict[str, str] = {
"env.type": "libero",
"env.task": "libero_spatial",
"eval.n_episodes": "20",
"eval.batch_size": "10",
}
POLICY_CONFIGS: dict[str, PolicyBenchmarkConfig] = {
"pi0": PolicyBenchmarkConfig(
policy_type="pi0",
policy_path="lerobot/pi0_base",
num_gpus=8,
chunk_size=30,
extra_policy_args={
"policy.n_action_steps": "30",
"policy.scheduler_decay_steps": "5000",
},
),
"pi0_fast": PolicyBenchmarkConfig(
policy_type="pi0_fast",
policy_path="lerobot/pi0fast-base",
num_gpus=8,
chunk_size=30,
extra_policy_args={
"policy.n_action_steps": "30",
"policy.scheduler_decay_steps": "5000",
},
needs_tokenizer=True,
tokenizer_args={
"repo_id": "lerobot/libero",
"action_horizon": "30",
"encoded_dims": "0:7",
"normalization_mode": "QUANTILES",
"vocab_size": "1024",
"scale": "10.0",
"push_to_hub": "true",
},
),
"pi05": PolicyBenchmarkConfig(
policy_type="pi05",
policy_path="lerobot/pi05_base",
num_gpus=8,
chunk_size=30,
extra_policy_args={
"policy.n_action_steps": "30",
"policy.scheduler_decay_steps": "5000",
},
),
"groot": PolicyBenchmarkConfig(
policy_type="groot",
policy_path=None,
num_gpus=8,
chunk_size=30,
extra_policy_args={
"policy.n_action_steps": "30",
"policy.base_model_path": "nvidia/GR00T-N1.5-3B",
"policy.tune_diffusion_model": "true",
"policy.tune_projector": "true",
"policy.tune_llm": "false",
"policy.tune_visual": "false",
"policy.use_bf16": "true",
},
),
"act": PolicyBenchmarkConfig(
policy_type="act",
policy_path=None,
num_gpus=1,
chunk_size=30,
extra_policy_args={"policy.n_action_steps": "30"},
),
"diffusion": PolicyBenchmarkConfig(
policy_type="diffusion",
policy_path=None,
num_gpus=1,
chunk_size=None,
extra_policy_args={
"policy.horizon": "32",
"policy.n_action_steps": "30",
"policy.n_obs_steps": "2",
},
),
"smolvla": PolicyBenchmarkConfig(
policy_type="smolvla",
policy_path="lerobot/smolvla_base",
num_gpus=8,
chunk_size=30,
extra_policy_args={
"policy.n_action_steps": "30",
"policy.load_vlm_weights": "true",
"policy.freeze_vision_encoder": "false",
"policy.train_expert_only": "false",
"policy.scheduler_decay_steps": "5000",
},
),
"xvla": PolicyBenchmarkConfig(
policy_type="xvla",
policy_path="lerobot/xvla-widowx",
num_gpus=4,
chunk_size=32,
extra_policy_args={
"policy.n_action_steps": "32",
"policy.scheduler_decay_steps": "5000",
},
),
"multi_task_dit": PolicyBenchmarkConfig(
policy_type="multi_task_dit",
policy_path=None,
num_gpus=1,
chunk_size=None,
extra_policy_args={
"policy.horizon": "32",
"policy.n_action_steps": "30",
},
),
}
ALL_POLICY_NAMES = list(POLICY_CONFIGS.keys())
# GPU memory estimates (GB) for SLURM --mem allocation
GPU_MEM_ESTIMATES: dict[str, int] = {
"pi0": 320,
"pi0_fast": 320,
"pi05": 280,
"groot": 320,
"act": 64,
"diffusion": 64,
"smolvla": 160,
"xvla": 160,
"multi_task_dit": 64,
}
# ──────────────────────────────────────────────────────────────────────
# SLURM script generation
# ──────────────────────────────────────────────────────────────────────
def _cli_args(args: dict[str, str]) -> str:
"""Build a backslash-continued CLI arg string with proper shell quoting."""
lines = []
for key, value in args.items():
if any(c in str(value) for c in ["{", "}", " ", '"', "'"]):
lines.append(f" --{key}='{value}'")
else:
lines.append(f" --{key}={value}")
return " \\\n".join(lines)
def _training_cli_args(
policy_name: str,
output_dir: Path,
hub_org: str,
benchmark_uuid: str,
) -> str:
cfg = POLICY_CONFIGS[policy_name]
args: dict[str, str] = {}
args.update(COMMON_TRAINING_ARGS)
args["policy.type"] = cfg.policy_type
if cfg.policy_path:
args["policy.path"] = cfg.policy_path
if cfg.chunk_size is not None:
args["policy.chunk_size"] = str(cfg.chunk_size)
args.update(cfg.extra_policy_args)
args["output_dir"] = str(output_dir / "train" / policy_name)
args["policy.repo_id"] = f"{hub_org}/{policy_name}_libero"
args["wandb.project"] = "lerobot-libero-benchmark"
args["wandb.run_name"] = f"{policy_name}_{benchmark_uuid[:8]}"
return _cli_args(args)
def _publish_snippet(
policy_name: str,
output_dir: Path,
hub_org: str,
benchmark_uuid: str,
hub_dataset: str,
) -> str:
"""Inline Python that each SLURM job runs to publish its own result row."""
cfg = POLICY_CONFIGS[policy_name]
steps = int(COMMON_TRAINING_ARGS["steps"])
bs = int(COMMON_TRAINING_ARGS["batch_size"])
eff_bs = bs * cfg.num_gpus
train_dir = output_dir / "train" / policy_name
return textwrap.dedent(f"""\
python3 -c "
import json, os, re, sys
from pathlib import Path
from datetime import datetime, timezone
timing = {{}}
tp = Path('{output_dir}/logs/{policy_name}_timing.txt')
if tp.exists():
for ln in tp.read_text().splitlines():
if '=' in ln:
k, _, v = ln.partition('=')
timing[k.strip()] = v.strip()
# Parse eval results
eval_sr, eval_per_task, eval_n = None, '{{}}', 0
eval_dir = Path('{train_dir}/eval_results')
if eval_dir.exists():
for jf in eval_dir.glob('**/*.json'):
try:
d = json.loads(jf.read_text())
except Exception:
continue
if 'avg_success_rate' in d:
eval_sr = d['avg_success_rate']
elif 'eval_info' in d and 'avg_success_rate' in d.get('eval_info', {{}}):
eval_sr = d['eval_info']['avg_success_rate']
pt = {{k: v for k, v in d.items() if 'success_rate' in k and k != 'avg_success_rate'}}
if pt:
eval_per_task = json.dumps(pt)
if 'n_episodes' in d:
eval_n = d['n_episodes']
# Parse final loss from SLURM stdout
final_loss = None
for lf in sorted(Path('{output_dir}/logs').glob('{policy_name}_*.out'), reverse=True):
losses = re.findall(r'\\\"loss\\\"\\s*:\\s*([\\d.e+-]+)', lf.read_text())
if losses:
final_loss = float(losses[-1])
break
# Parse peak GPU mem
peak_mem = 0.0
csv_p = Path('{output_dir}/logs/{policy_name}_gpu_mem.csv')
if csv_p.exists():
for ln in csv_p.read_text().splitlines():
parts = ln.strip().split(',')
if len(parts) >= 2:
try:
peak_mem = max(peak_mem, float(parts[1].strip()))
except ValueError:
pass
# Parse train config for optimizer details
lr, opt_wd, sched_type, sched_warmup, sched_decay = 0.0, 0.0, '', 0, 0
freeze_ve, train_eo, grad_ckpt = False, False, False
cfg_path = Path('{train_dir}/checkpoints/{steps:06d}/pretrained_model/train_config.json')
if cfg_path.exists():
tc = json.loads(cfg_path.read_text())
o = tc.get('optimizer', {{}})
lr = o.get('lr', 0.0)
opt_wd = o.get('weight_decay', 0.0)
s = tc.get('scheduler', {{}})
sched_type = s.get('type', '')
sched_warmup = s.get('num_warmup_steps', 0)
sched_decay = s.get('num_decay_steps', 0)
p = tc.get('policy', {{}})
freeze_ve = p.get('freeze_vision_encoder', False)
train_eo = p.get('train_expert_only', False)
grad_ckpt = p.get('gradient_checkpointing', False)
row = {{
'benchmark_uuid': '{benchmark_uuid}',
'policy_type': '{policy_name}',
'policy_repo_id': '{hub_org}/{policy_name}_libero',
'base_model_repo_id': '{cfg.policy_path or ""}',
'dataset_repo_id': '{COMMON_TRAINING_ARGS["dataset.repo_id"]}',
'env_type': '{COMMON_TRAINING_ARGS["env.type"]}',
'env_task': '{COMMON_TRAINING_ARGS["env.task"]}',
'steps': {steps},
'batch_size_per_gpu': {bs},
'num_gpus': {cfg.num_gpus},
'effective_batch_size': {eff_bs},
'total_samples_seen': {steps * eff_bs},
'chunk_size': {cfg.chunk_size or 0},
'learning_rate': lr,
'optimizer_type': 'AdamW',
'optimizer_weight_decay': opt_wd,
'scheduler_type': sched_type,
'scheduler_warmup_steps': sched_warmup,
'scheduler_decay_steps': sched_decay,
'freeze_vision_encoder': freeze_ve,
'train_expert_only': train_eo,
'gradient_checkpointing': grad_ckpt,
'eval_success_rate': eval_sr,
'eval_success_rate_per_task': eval_per_task,
'eval_n_episodes': eval_n,
'final_train_loss': final_loss,
'training_time_s': float(timing.get('TRAINING_TIME_S', 0)),
'peak_gpu_memory_mb': peak_mem or float(timing.get('MAX_GPU_MEM_MB', 0)),
'gpu_type': timing.get('GPU_TYPE', 'unknown'),
'lerobot_commit': timing.get('LEROBOT_COMMIT', 'unknown'),
'timestamp': datetime.now(timezone.utc).isoformat(),
}}
# Save locally
Path('{train_dir}/benchmark_result.json').write_text(json.dumps(row, indent=2, default=str))
# Push to HF dataset
try:
from datasets import Dataset, load_dataset
try:
existing = load_dataset('{hub_dataset}', split='train')
rows = existing.to_list() + [row]
except Exception:
rows = [row]
Dataset.from_list(rows).push_to_hub('{hub_dataset}', split='train')
print('Published result to {hub_dataset}')
except ImportError:
print('datasets library not installed — result saved locally only')
except Exception as e:
print(f'Failed to push to hub: {{e}} — result saved locally')
"
""")
def _generate_sbatch_script(
policy_name: str,
output_dir: Path,
hub_org: str,
benchmark_uuid: str,
hub_dataset: str,
lerobot_commit: str,
) -> str:
cfg = POLICY_CONFIGS[policy_name]
steps = int(COMMON_TRAINING_ARGS["steps"])
log_dir = output_dir / "logs"
train_dir = output_dir / "train" / policy_name
checkpoint_path = train_dir / f"checkpoints/{steps:06d}/pretrained_model"
training_args = _training_cli_args(policy_name, output_dir, hub_org, benchmark_uuid)
eval_args = _cli_args(EVAL_ARGS)
publish = _publish_snippet(policy_name, output_dir, hub_org, benchmark_uuid, hub_dataset)
return textwrap.dedent(f"""\
#!/bin/bash
#SBATCH --job-name=bench_{policy_name}
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:{cfg.num_gpus}
#SBATCH --cpus-per-task={cfg.num_gpus * 8}
#SBATCH --mem={GPU_MEM_ESTIMATES.get(policy_name, 128)}G
#SBATCH --time=06:00:00
#SBATCH --output={log_dir}/{policy_name}_%j.out
#SBATCH --error={log_dir}/{policy_name}_%j.err
set -euo pipefail
echo "=========================================="
echo "LeRobot LIBERO Benchmark — {policy_name}"
echo "UUID: {benchmark_uuid}"
echo "Start: $(date -Iseconds)"
echo "Host: $(hostname) | GPUs: {cfg.num_gpus}"
echo "=========================================="
START_TIME=$(date +%s)
# GPU memory monitoring (every 30s)
nvidia-smi --query-gpu=index,memory.used,memory.total,gpu_name \\
--format=csv,noheader,nounits -l 30 \\
> "{log_dir}/{policy_name}_gpu_mem.csv" &
GPU_MONITOR_PID=$!
# ── Training ──────────────────────────────────────────────────
echo "[$(date -Iseconds)] Starting training..."
# `|| TRAIN_EXIT=$?` captures a failure without tripping set -e,
# so timing and publishing still run when training crashes.
TRAIN_EXIT=0
accelerate launch --num_processes={cfg.num_gpus} \\
$(which lerobot-train) \\
{training_args} || TRAIN_EXIT=$?
TRAIN_END=$(date +%s)
echo "[$(date -Iseconds)] Training exit code: $TRAIN_EXIT"
# ── Evaluation ────────────────────────────────────────────────
EVAL_EXIT=1
if [ $TRAIN_EXIT -eq 0 ]; then
echo "[$(date -Iseconds)] Starting evaluation..."
EVAL_EXIT=0
lerobot-eval \\
--policy.path="{checkpoint_path}" \\
{eval_args} \\
--output_dir="{train_dir}/eval_results" || EVAL_EXIT=$?
echo "[$(date -Iseconds)] Eval exit code: $EVAL_EXIT"
else
echo "[$(date -Iseconds)] Skipping eval — training failed."
fi
# ── Timing ────────────────────────────────────────────────────
END_TIME=$(date +%s)
kill $GPU_MONITOR_PID 2>/dev/null || true
cat > "{log_dir}/{policy_name}_timing.txt" <<TIMING_EOF
BENCHMARK_UUID={benchmark_uuid}
POLICY_TYPE={policy_name}
TRAINING_TIME_S=$((TRAIN_END - START_TIME))
TOTAL_TIME_S=$((END_TIME - START_TIME))
TRAIN_EXIT=$TRAIN_EXIT
EVAL_EXIT=$EVAL_EXIT
MAX_GPU_MEM_MB=$(awk -F',' '{{print $2}}' "{log_dir}/{policy_name}_gpu_mem.csv" 2>/dev/null | sort -n | tail -1)
GPU_TYPE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | head -1 | xargs)
LEROBOT_COMMIT={lerobot_commit}
TIMING_EOF
# ── Publish result to HF dataset ──────────────────────────────
echo "[$(date -Iseconds)] Publishing result..."
{publish}
echo "=========================================="
echo "Done: $(date -Iseconds)"
echo "Training: $((TRAIN_END - START_TIME))s | Total: $((END_TIME - START_TIME))s"
echo "=========================================="
""")
def _generate_tokenizer_script(
output_dir: Path,
hub_org: str,
benchmark_uuid: str,
) -> str:
cfg = POLICY_CONFIGS["pi0_fast"]
log_dir = output_dir / "logs"
tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
tok_args = dict(cfg.tokenizer_args)
tok_args["hub_repo_id"] = tokenizer_hub_repo
return textwrap.dedent(f"""\
#!/bin/bash
#SBATCH --job-name=bench_tokenizer
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=01:00:00
#SBATCH --output={log_dir}/tokenizer_%j.out
#SBATCH --error={log_dir}/tokenizer_%j.err
set -euo pipefail
echo "LeRobot — FAST Tokenizer | UUID: {benchmark_uuid}"
lerobot-train-tokenizer \\
{_cli_args(tok_args)}
echo "Tokenizer pushed to: {tokenizer_hub_repo}"
""")
# ──────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description="Generate SLURM scripts for LeRobot LIBERO benchmark.")
parser.add_argument(
"--policies",
nargs="+",
default=ALL_POLICY_NAMES,
choices=ALL_POLICY_NAMES,
help="Policies to benchmark (default: all).",
)
parser.add_argument("--output_dir", type=Path, required=True, help="Root output directory.")
parser.add_argument("--hub_org", type=str, default="lerobot", help="HuggingFace org.")
parser.add_argument("--hub_dataset", type=str, default=None, help="HF dataset repo for results.")
parser.add_argument("--uuid", type=str, default=None, help="Override benchmark UUID.")
args = parser.parse_args()
benchmark_uuid = args.uuid or str(uuid.uuid4())
output_dir: Path = args.output_dir.resolve()
policies: list[str] = args.policies
hub_org: str = args.hub_org
hub_dataset: str = args.hub_dataset or f"{hub_org}/benchmark-libero"
try:
commit = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
except (subprocess.CalledProcessError, FileNotFoundError):
commit = "unknown"
scripts_dir = output_dir / "slurm_scripts"
log_dir = output_dir / "logs"
scripts_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)
for p in policies:
(output_dir / "train" / p).mkdir(parents=True, exist_ok=True)
generated: dict[str, Path] = {}
# Tokenizer job for pi0_fast
tokenizer_path = None
if "pi0_fast" in policies:
script = _generate_tokenizer_script(output_dir, hub_org, benchmark_uuid)
tokenizer_path = scripts_dir / "00_tokenizer.sh"
tokenizer_path.write_text(script)
tokenizer_path.chmod(0o755)
generated["tokenizer"] = tokenizer_path
tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
POLICY_CONFIGS["pi0_fast"].extra_policy_args["policy.action_tokenizer_name"] = tokenizer_hub_repo
# Per-policy scripts
for i, name in enumerate(sorted(policies), start=1):
script = _generate_sbatch_script(name, output_dir, hub_org, benchmark_uuid, hub_dataset, commit)
path = scripts_dir / f"{i:02d}_{name}.sh"
path.write_text(script)
path.chmod(0o755)
generated[name] = path
# Manifest
manifest = {
"benchmark_uuid": benchmark_uuid,
"timestamp": datetime.now(UTC).isoformat(),
"lerobot_commit": commit,
"hub_org": hub_org,
"hub_dataset": hub_dataset,
"policies": policies,
"output_dir": str(output_dir),
"scripts": {k: str(v) for k, v in generated.items()},
}
manifest_path = output_dir / "benchmark_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
# Instructions
print("=" * 60)
print("LeRobot LIBERO Benchmark — Scripts Generated")
print(f"UUID: {benchmark_uuid}")
print(f"Output: {output_dir}")
print(f"Results dataset: {hub_dataset}")
print("=" * 60)
print()
for _name, path in sorted(generated.items()):
print(f" {path}")
print()
if tokenizer_path:
print("IMPORTANT: pi0_fast requires tokenizer training FIRST.")
print(f" 1. sbatch {tokenizer_path}")
print(" 2. Wait for completion")
print(f" 3. sbatch {generated.get('pi0_fast', 'N/A')}")
print(" 4. All other policies can run in parallel")
else:
print("All scripts can be submitted in parallel.")
print()
print("Each job publishes its result to the HF dataset automatically.")
if __name__ == "__main__":
main()
+156
View File
@@ -0,0 +1,156 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
from __future__ import annotations
import argparse
import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
def load_json_if_exists(path: Path) -> dict[str, Any] | None:
if not path.exists():
return None
return json.loads(path.read_text())
def find_latest_train_config_path(run_root: Path) -> Path | None:
checkpoints_dir = run_root / "train" / "checkpoints"
if not checkpoints_dir.exists():
return None
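# Checkpoint dirs are zero-padded step counts (e.g. "020000"), so sorting
# by path.parts[-3] lexicographically also sorts them numerically.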
candidates = sorted(
checkpoints_dir.glob("*/pretrained_model/train_config.json"),
key=lambda path: path.parts[-3],
)
return candidates[-1] if candidates else None
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--benchmark", required=True)
parser.add_argument("--policy", required=True)
parser.add_argument("--run_root", required=True, type=Path)
parser.add_argument("--results_repo", required=True)
parser.add_argument("--git_commit", required=True)
parser.add_argument("--num_gpus", required=True, type=int)
parser.add_argument("--microbatch_per_gpu", required=True, type=int)
parser.add_argument("--gradient_accumulation_steps", required=True, type=int)
parser.add_argument("--effective_batch_size", required=True, type=int)
parser.add_argument("--train_wall_time_s", required=True, type=float)
parser.add_argument("--eval_wall_time_s", required=True, type=float)
parser.add_argument("--slurm_job_id", default="")
parser.add_argument("--docker_image", required=True)
return parser.parse_args()
def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
now = datetime.now(UTC)
created_at = now.isoformat()
timestamp = utc_timestamp_slug(now)
run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
eval_info = load_json_if_exists(args.run_root / "eval" / "eval_info.json") or {}
train_config_path = find_latest_train_config_path(args.run_root)
train_config = load_json_if_exists(train_config_path) or {}
artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"
row = {
"schema_version": 1,
"created_at": created_at,
"run_id": run_id,
"benchmark": args.benchmark,
"policy": args.policy,
"git_commit": args.git_commit,
"slurm_job_id": args.slurm_job_id or None,
"docker_image": args.docker_image,
"resources": {
"num_gpus": args.num_gpus,
"microbatch_per_gpu": args.microbatch_per_gpu,
"gradient_accumulation_steps": args.gradient_accumulation_steps,
"effective_batch_size": args.effective_batch_size,
},
"timings": {
"train_wall_time_s": args.train_wall_time_s,
"eval_wall_time_s": args.eval_wall_time_s,
"total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
},
"eval": {
"overall": eval_info.get("overall", {}),
"per_group": eval_info.get("per_group", {}),
"per_task_count": len(eval_info.get("per_task", [])),
},
"paths": {
"run_root": str(args.run_root),
"train_dir": str(args.run_root / "train"),
"eval_dir": str(args.run_root / "eval"),
},
"train_config": train_config,
"artifact_urls": {
"row": make_hub_file_url(args.results_repo, row_path_in_repo),
},
}
row_path = args.run_root / "benchmark_row.json"
row_path.parent.mkdir(parents=True, exist_ok=True)
upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]
eval_info_path = args.run_root / "eval" / "eval_info.json"
if eval_info_path.exists():
row["artifact_urls"]["eval_info"] = make_hub_file_url(
args.results_repo, f"{artifact_prefix}/eval_info.json"
)
upload_list.append(
UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
)
if train_config_path is not None and train_config_path.exists():
row["artifact_urls"]["train_config"] = make_hub_file_url(
args.results_repo, f"{artifact_prefix}/train_config.json"
)
upload_list.append(
UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
)
row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
return row, upload_list
def main() -> int:
args = parse_args()
row, upload_list = build_row(args)
uploaded = upload_targets(
repo_id=args.results_repo,
targets=upload_list,
repo_type="dataset",
private=False,
commit_message=f"Add benchmark row {row['run_id']}",
)
row["uploaded_paths"] = uploaded
row_path = args.run_root / "benchmark_row.json"
row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
print(json.dumps(row, indent=2, sort_keys=True))
return 0
if __name__ == "__main__":
raise SystemExit(main())
+647
View File
@@ -0,0 +1,647 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
from __future__ import annotations
import argparse
import json
import subprocess
from dataclasses import asdict, dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from lerobot.utils.history_repo import utc_timestamp_slug
MAX_GPUS = 8
MIN_GPUS = 1
DEFAULT_STEPS = 20_000
DEFAULT_EFFECTIVE_BATCH_SIZE = 256
DEFAULT_MICROBATCH_PER_GPU = 32
DEFAULT_EVAL_BATCH_SIZE = 1
DEFAULT_CPUS_PER_GPU = 8
DEFAULT_MEMORY_PER_GPU_GB = 40
@dataclass(frozen=True)
class BenchmarkSpec:
name: str
dataset_repo_id: str
docker_image: str
eval_env_type: str
eval_task: str
eval_n_episodes: int
train_steps: int = DEFAULT_STEPS
effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
train_extra_args: dict[str, Any] = field(default_factory=dict)
eval_extra_args: dict[str, Any] = field(default_factory=dict)
@dataclass(frozen=True)
class PolicySpec:
name: str
policy_type: str
num_gpus: int
policy_path: str | None = None
microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
extra_train_args: dict[str, Any] = field(default_factory=dict)
extra_eval_args: dict[str, Any] = field(default_factory=dict)
needs_tokenizer: bool = False
tokenizer_args: dict[str, Any] = field(default_factory=dict)
@dataclass(frozen=True)
class PlannedJob:
benchmark: str
policy: str
run_rel: str
num_gpus: int
microbatch_per_gpu: int
gradient_accumulation_steps: int
effective_batch_size: int
docker_image: str
train_args: dict[str, Any]
eval_args: dict[str, Any]
tokenizer_args: dict[str, Any] | None
script_path: str
BENCHMARKS: dict[str, BenchmarkSpec] = {
"libero_plus": BenchmarkSpec(
name="libero_plus",
dataset_repo_id="lerobot/libero_plus",
docker_image="lerobot-benchmark-libero-plus:latest",
eval_env_type="libero_plus",
eval_task="libero_spatial,libero_object,libero_goal,libero_10",
eval_n_episodes=10,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.image2": "observation.images.camera2",
},
},
eval_extra_args={
"env.camera_name_mapping": {
"agentview_image": "camera1",
"robot0_eye_in_hand_image": "camera2",
},
"env.max_parallel_tasks": 1,
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
"robomme": BenchmarkSpec(
name="robomme",
dataset_repo_id="lerobot/robomme",
docker_image="lerobot-benchmark-robomme:latest",
eval_env_type="robomme",
eval_task=(
"BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
"ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
"VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
),
eval_n_episodes=50,
train_extra_args={
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
},
eval_extra_args={
"env.dataset_split": "test",
"env.max_parallel_tasks": 1,
"rename_map": {
"observation.images.image": "observation.images.camera1",
"observation.images.wrist_image": "observation.images.camera2",
},
"eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
"eval.use_async_envs": False,
"eval.max_episodes_rendered": 0,
"policy.device": "cuda",
},
),
}
POLICIES: dict[str, PolicySpec] = {
"pi0": PolicySpec(
name="pi0",
policy_type="pi0",
policy_path="lerobot/pi0_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"pi0_fast": PolicySpec(
name="pi0_fast",
policy_type="pi0_fast",
policy_path="lerobot/pi0fast-base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
needs_tokenizer=True,
tokenizer_args={
"action_horizon": 30,
"encoded_dims": "0:7",
"normalization_mode": "QUANTILES",
"vocab_size": 1024,
"scale": 10.0,
"push_to_hub": True,
},
),
"pi05": PolicySpec(
name="pi05",
policy_type="pi05",
policy_path="lerobot/pi05_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 0,
},
),
"groot": PolicySpec(
name="groot",
policy_type="groot",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.base_model_path": "nvidia/GR00T-N1.5-3B",
"policy.tune_diffusion_model": True,
"policy.tune_projector": True,
"policy.tune_llm": False,
"policy.tune_visual": False,
"policy.use_bf16": True,
},
),
"act": PolicySpec(
name="act",
policy_type="act",
num_gpus=1,
extra_train_args={
"policy.n_action_steps": 30,
},
),
"diffusion": PolicySpec(
name="diffusion",
policy_type="diffusion",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
"policy.n_obs_steps": 2,
},
),
"smolvla": PolicySpec(
name="smolvla",
policy_type="smolvla",
policy_path="lerobot/smolvla_base",
num_gpus=8,
extra_train_args={
"policy.n_action_steps": 30,
"policy.load_vlm_weights": True,
"policy.freeze_vision_encoder": False,
"policy.train_expert_only": False,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"xvla": PolicySpec(
name="xvla",
policy_type="xvla",
policy_path="lerobot/xvla-widowx",
num_gpus=4,
extra_train_args={
"policy.n_action_steps": 32,
"policy.scheduler_decay_steps": DEFAULT_STEPS,
"policy.empty_cameras": 1,
},
),
"multi_task_dit": PolicySpec(
name="multi_task_dit",
policy_type="multi_task_dit",
num_gpus=1,
extra_train_args={
"policy.horizon": 32,
"policy.n_action_steps": 30,
},
),
}
def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"
def get_requested_names(
requested: list[str] | None,
available: dict[str, Any],
*,
kind: str,
) -> list[str]:
if not requested:
return list(available)
unknown = sorted(set(requested) - set(available))
if unknown:
raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
return requested
def compute_gradient_accumulation_steps(
*,
effective_batch_size: int,
num_gpus: int,
microbatch_per_gpu: int,
) -> int:
per_step_batch = num_gpus * microbatch_per_gpu
if effective_batch_size % per_step_batch != 0:
raise ValueError(
f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
f"{microbatch_per_gpu=}."
)
return effective_batch_size // per_step_batch
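# Example: effective 256 with 4 GPUs x 32 per GPU -> 256 // 128 = 2
# gradient-accumulation steps per optimizer update.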
def make_run_slug() -> str:
return utc_timestamp_slug()
def shell_value(value: Any) -> str:
if isinstance(value, bool):
value = "true" if value else "false"
elif isinstance(value, (dict, list)):
value = json.dumps(value, sort_keys=True)
else:
value = str(value)
escaped = (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("$", "\\$")
.replace("`", "\\`")
)
return f'"{escaped}"'
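# e.g. shell_value({"a": "b"}) -> "{\"a\": \"b\"}" so JSON dicts survive
# the generated script's quoting; $ and backticks are escaped to block
# expansion by the outer shell.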
def format_cli_args(args: dict[str, Any]) -> str:
lines = []
for key, value in args.items():
lines.append(f" --{key}={shell_value(value)}")
return " \\\n".join(lines)
def build_train_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
train_dir: str,
gradient_accumulation_steps: int,
) -> dict[str, Any]:
args: dict[str, Any] = {
"dataset.repo_id": benchmark.dataset_repo_id,
"output_dir": train_dir,
"steps": benchmark.train_steps,
"batch_size": policy.microbatch_per_gpu,
"gradient_accumulation_steps": gradient_accumulation_steps,
"eval_freq": 0,
"save_freq": benchmark.train_steps,
"save_checkpoint": True,
"log_freq": 100,
"wandb.enable": False,
"policy.push_to_hub": False,
"policy.device": "cuda",
}
if policy.policy_path:
args["policy.path"] = policy.policy_path
else:
args["policy.type"] = policy.policy_type
args.update(benchmark.train_extra_args)
args.update(policy.extra_train_args)
return args
def build_eval_args(
*,
benchmark: BenchmarkSpec,
policy: PolicySpec,
checkpoint_path: str,
eval_dir: str,
) -> dict[str, Any]:
args: dict[str, Any] = {
"policy.path": checkpoint_path,
"env.type": benchmark.eval_env_type,
"env.task": benchmark.eval_task,
"eval.n_episodes": benchmark.eval_n_episodes,
"output_dir": eval_dir,
}
args.update(benchmark.eval_extra_args)
args.update(policy.extra_eval_args)
return args
def plan_jobs(
*,
output_dir: Path,
hub_org: str,
results_repo: str,
policies: list[str],
benchmarks: list[str],
) -> list[PlannedJob]:
_ = results_repo  # not used when planning jobs; it is recorded in the manifest
scripts_dir = output_dir / "slurm"
jobs: list[PlannedJob] = []
for benchmark_name in benchmarks:
benchmark = BENCHMARKS[benchmark_name]
for policy_name in policies:
policy = POLICIES[policy_name]
num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
run_root = f"/benchmark-output/{run_rel}"
gradient_accumulation_steps = compute_gradient_accumulation_steps(
effective_batch_size=benchmark.effective_batch_size,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
)
train_dir = f"{run_root}/train"
checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
eval_dir = f"{run_root}/eval"
train_args = build_train_args(
benchmark=benchmark,
policy=policy,
train_dir=train_dir,
gradient_accumulation_steps=gradient_accumulation_steps,
)
eval_args = build_eval_args(
benchmark=benchmark,
policy=policy,
checkpoint_path=checkpoint_path,
eval_dir=eval_dir,
)
tokenizer_args = None
if policy.needs_tokenizer:
tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
tokenizer_args = {
"repo_id": benchmark.dataset_repo_id,
"output_dir": f"{run_root}/tokenizer",
"hub_repo_id": tokenizer_repo_id,
**policy.tokenizer_args,
}
train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
jobs.append(
PlannedJob(
benchmark=benchmark_name,
policy=policy_name,
run_rel=run_rel,
num_gpus=num_gpus,
microbatch_per_gpu=policy.microbatch_per_gpu,
gradient_accumulation_steps=gradient_accumulation_steps,
effective_batch_size=benchmark.effective_batch_size,
docker_image=benchmark.docker_image,
train_args=train_args,
eval_args=eval_args,
tokenizer_args=tokenizer_args,
script_path=script_path,
)
)
return jobs
def render_sbatch_script(
*,
job: PlannedJob,
output_dir: Path,
results_repo_id: str,
git_commit: str,
) -> str:
host_output_dir = output_dir.resolve()
run_root = f"/benchmark-output/{job.run_rel}"
host_run_root = host_output_dir / job.run_rel
cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
gpu_ids_expr = "${GPU_IDS}"
train_cli = format_cli_args(job.train_args)
eval_cli = format_cli_args(job.eval_args)
tokenizer_command = ""
if job.tokenizer_args:
tokenizer_cli = format_cli_args(job.tokenizer_args)
tokenizer_command = f"""
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-train-tokenizer \\
{tokenizer_cli}
'
"""
return f"""#!/bin/bash
#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
#SBATCH --gres=gpu:{job.num_gpus}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem_gb}G
#SBATCH --output={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.out
#SBATCH --error={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.err
set -euo pipefail
HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
RUN_ROOT="{run_root}"
mkdir -p "{host_output_dir}/logs"
mkdir -p "{host_run_root.parent}"
{tokenizer_command}
TRAIN_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
{train_cli}
'
TRAIN_END="$(date +%s)"
EVAL_START="$(date +%s)"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
lerobot-eval \\
{eval_cli}
'
EVAL_END="$(date +%s)"
TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
docker run --rm --gpus all \\
--shm-size=16g \\
-e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-e HF_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-e HF_HOME=/tmp/hf \\
-e RUN_ROOT="${{RUN_ROOT}}" \\
-e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
-e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
-v "{host_output_dir}:/benchmark-output" \\
-w /lerobot \\
"{job.docker_image}" \\
bash -lc '
set -euo pipefail
if [[ -n "${{HF_TOKEN:-}}" ]]; then
hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
fi
uv run python benchmarks/publish_benchmark_result.py \\
--benchmark={job.benchmark} \\
--policy={job.policy} \\
--run_root="${{RUN_ROOT}}" \\
--results_repo={results_repo_id} \\
--git_commit={git_commit} \\
--num_gpus={job.num_gpus} \\
--microbatch_per_gpu={job.microbatch_per_gpu} \\
--gradient_accumulation_steps={job.gradient_accumulation_steps} \\
--effective_batch_size={job.effective_batch_size} \\
--train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
--eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
--slurm_job_id="${{SLURM_JOB_ID:-}}" \\
--docker_image={job.docker_image}
'
"""
def write_manifest(
*,
output_dir: Path,
jobs: list[PlannedJob],
git_commit: str,
hub_org: str,
results_repo: str,
) -> Path:
manifest = {
"generated_at": datetime.now(UTC).isoformat(),
"git_commit": git_commit,
"hub_org": hub_org,
"results_repo": results_repo,
"jobs": [asdict(job) for job in jobs],
}
manifest_path = output_dir / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
return manifest_path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--policies", nargs="*", default=None)
parser.add_argument("--benchmarks", nargs="*", default=None)
parser.add_argument("--output_dir", required=True, type=Path)
parser.add_argument("--hub_org", required=True)
parser.add_argument("--results_repo", required=True)
parser.add_argument("--submit", action="store_true")
return parser.parse_args()
def get_git_commit() -> str:
return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
def main() -> int:
args = parse_args()
args.output_dir.mkdir(parents=True, exist_ok=True)
(args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
(args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
git_commit = get_git_commit()
results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
jobs = plan_jobs(
output_dir=args.output_dir,
hub_org=args.hub_org,
results_repo=results_repo_id,
policies=selected_policies,
benchmarks=selected_benchmarks,
)
for job in jobs:
script = render_sbatch_script(
job=job,
output_dir=args.output_dir,
results_repo_id=results_repo_id,
git_commit=git_commit,
)
script_path = Path(job.script_path)
script_path.write_text(script)
script_path.chmod(0o755)
if args.submit:
subprocess.run(["sbatch", str(script_path)], check=True)
manifest_path = write_manifest(
output_dir=args.output_dir,
jobs=jobs,
git_commit=git_commit,
hub_org=args.hub_org,
results_repo=results_repo_id,
)
print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
print(f"Manifest: {manifest_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())
+48
View File
@@ -0,0 +1,48 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM huggingface/lerobot-gpu:latest
USER root
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot
RUN uv pip install --no-cache \
"robosuite==1.4.1" bddl easydict mujoco matplotlib wand scikit-image gym
ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
RUN git clone --depth=1 https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
&& cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
&& uv pip uninstall hf-libero 2>/dev/null || true
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
RUN python -c "\
from huggingface_hub import hf_hub_download; \
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
&& unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
&& mv /tmp/libero-plus-dl/extract/inspire/hdd/project/embodied-multimodality/public/syfei/libero_new/release/dataset/LIBERO-plus-0/assets \
${LIBERO_PLUS_ROOT}/assets \
&& rm -rf /tmp/libero-plus-dl
RUN mkdir -p /home/user_lerobot/.libero \
&& printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
> /home/user_lerobot/.libero/config.yaml
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]
+39
View File
@@ -0,0 +1,39 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM huggingface/lerobot-gpu:latest
ENV NVIDIA_DRIVER_CAPABILITIES=all \
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
USER root
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
libvulkan1 libvulkan-dev mesa-vulkan-drivers \
&& mkdir -p /usr/share/vulkan/icd.d \
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
> /usr/share/vulkan/icd.d/nvidia_icd.json \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
&& uv pip install --no-cache --override /tmp/robomme_override.txt \
-e ".[smolvla,av-dep]" \
"robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
&& python -c "import robomme; print('robomme import OK')"
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]
+27 -2
View File
@@ -31,10 +31,22 @@ from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# LIBERO-plus derives task.language by space-joining the perturbation-variant
# filename, so strip the perturbation metadata blob to recover the base prompt.
_LIBERO_PERTURBATION_TAIL_RE = re.compile(
r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
)
def _strip_libero_perturbation_tail(instruction: str) -> str:
return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
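# e.g. _strip_libero_perturbation_tail("open the drawer view 3 light 2")
# -> "open the drawer"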
def _libero_descriptions(task_suite: str) -> dict[str, str]:
from libero.libero import benchmark # type: ignore[import-untyped]
@@ -47,7 +59,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
)
return {}
suite = suite_dict[task_suite]()
return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
return {
f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
for i in range(suite.n_tasks)
}
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
@@ -57,6 +72,14 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
return {f"{task_name}_0": label}
def _robomme_descriptions(task_names: str) -> dict[str, str]:
return {
f"{task_name}_0": task_name.replace("_", " ").strip()
for task_name in (task.strip() for task in task_names.split(","))
if task_name
}
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -66,10 +89,12 @@ def main() -> int:
descriptions: dict[str, str] = {}
try:
if args.env == "libero":
if args.env in {"libero", "libero_plus"}:
descriptions = _libero_descriptions(args.task)
elif args.env == "metaworld":
descriptions = _metaworld_descriptions(args.task)
elif args.env == "robomme":
descriptions = _robomme_descriptions(args.task)
else:
print(
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
+27
View File
@@ -0,0 +1,27 @@
---
title: LeRobot Benchmark Leaderboard
emoji: 🤖
colorFrom: yellow
colorTo: orange
sdk: gradio
sdk_version: 5.29.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Benchmark history for LeRobot policy × benchmark runs
---
# LeRobot Benchmark Leaderboard
This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
- Latest result per policy and benchmark
- Historical trends over time
- Direct links to uploaded eval and config artifacts
## Configuration
Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI
at a different public dataset. The default is:
- `lerobot/benchmark-history`
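
For local debugging outside the Space, rows can be fetched directly (a minimal sketch; it mirrors `load_rows` in `app.py` and assumes the `rows/<benchmark>/<policy>/<run_id>.json` layout used by the publisher):

```python
import json

from huggingface_hub import HfApi, hf_hub_download

repo_id = "lerobot/benchmark-history"
files = HfApi().list_repo_files(repo_id=repo_id, repo_type="dataset")
# Row filenames start with a UTC timestamp slug, so sorting gives run order.
for path in sorted(p for p in files if p.startswith("rows/"))[-5:]:
    local = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path)
    with open(local) as f:
        row = json.load(f)
    print(row["benchmark"], row["policy"], row["eval"]["overall"].get("pc_success"))
```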
+226
View File
@@ -0,0 +1,226 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
import os
import time
from pathlib import Path
from typing import Any
import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi, hf_hub_download
RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_TTL_S = 300
_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
overall = row.get("eval", {}).get("overall", {})
resources = row.get("resources", {})
timings = row.get("timings", {})
artifact_urls = row.get("artifact_urls", {})
return {
"created_at": row.get("created_at"),
"benchmark": row.get("benchmark"),
"policy": row.get("policy"),
"success_rate": overall.get("pc_success"),
"n_episodes": overall.get("n_episodes"),
"avg_sum_reward": overall.get("avg_sum_reward"),
"train_wall_time_s": timings.get("train_wall_time_s"),
"eval_wall_time_s": timings.get("eval_wall_time_s"),
"total_wall_time_s": timings.get("total_wall_time_s"),
"num_gpus": resources.get("num_gpus"),
"microbatch_per_gpu": resources.get("microbatch_per_gpu"),
"gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
"effective_batch_size": resources.get("effective_batch_size"),
"git_commit": row.get("git_commit"),
"row_url": artifact_urls.get("row"),
"eval_info_url": artifact_urls.get("eval_info"),
"train_config_url": artifact_urls.get("train_config"),
}
def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
cache_key = f"rows::{repo_id}"
cached = _CACHE.get(cache_key)
if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
return cached[1]
api = HfApi()
files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
records: list[dict[str, Any]] = []
for path_in_repo in sorted(files, reverse=True):
local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
with open(local_path) as f:
row = json.load(f)
records.append(_row_to_record(row))
df = pd.DataFrame.from_records(records)
if not df.empty:
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
_CACHE[cache_key] = (time.monotonic(), df)
return df
def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
if df.empty:
return df
latest = (
df.sort_values("created_at", ascending=False)
.groupby(["benchmark", "policy"], as_index=False)
.first()
.sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
)
return latest[
[
"benchmark",
"policy",
"success_rate",
"n_episodes",
"train_wall_time_s",
"eval_wall_time_s",
"num_gpus",
"effective_batch_size",
"git_commit",
"row_url",
"eval_info_url",
"train_config_url",
]
]
def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
filtered = df[df["benchmark"] == benchmark]
if policy and policy != "All":
filtered = filtered[filtered["policy"] == policy]
if filtered.empty:
return px.line(title="No benchmark rows found")
fig = px.line(
filtered.sort_values("created_at"),
x="created_at",
y="success_rate",
color="policy",
markers=True,
hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
title=f"{benchmark} success rate history",
)
fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
return fig
def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
filtered = df[df["benchmark"] == benchmark]
if policy and policy != "All":
filtered = filtered[filtered["policy"] == policy]
if filtered.empty:
return "No matching runs yet."
latest = filtered.sort_values("created_at", ascending=False).iloc[0]
row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
lines = [
f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
f"Success rate: `{latest['success_rate']}`",
f"GPUs: `{latest['num_gpus']}`",
f"Effective batch size: `{latest['effective_batch_size']}`",
f"Commit: `{latest['git_commit']}`",
]
if row_link:
lines.append(f"Row JSON: [open]({row_link})")
if eval_link:
lines.append(f"Eval Info: [open]({eval_link})")
if train_link:
lines.append(f"Train Config: [open]({train_link})")
return "\n\n".join(lines)
def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
df = load_rows()
latest_table = make_latest_table(df)
benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
if benchmark not in benchmark_names and benchmark_names:
benchmark = benchmark_names[0]
policy_choices = ["All"]
if benchmark and not df.empty:
policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
if policy not in policy_choices:
policy = "All"
history = make_history_figure(df, benchmark, policy)
summary = make_run_markdown(df, benchmark, policy)
return latest_table, gr.update(choices=policy_choices, value=policy), history, summary
with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
gr.Markdown(
f"""
# LeRobot Benchmark Leaderboard
Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
"""
)
with gr.Row():
benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
refresh_button = gr.Button("Refresh")
latest_table = gr.Dataframe(label="Latest Results", interactive=False)
history_plot = gr.Plot(label="History")
latest_summary = gr.Markdown()
def _initial_state():
df = load_rows()
benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
benchmark = benchmarks[0] if benchmarks else ""
latest, policy_choices, history, summary = refresh_view(benchmark, "All")
return (
gr.update(choices=benchmarks, value=benchmark),
policy_choices,
latest,
history,
summary,
)
demo.load(
_initial_state,
outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
)
refresh_button.click(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
benchmark_dropdown.change(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
policy_dropdown.change(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
if __name__ == "__main__":
demo.launch()
+4
View File
@@ -0,0 +1,4 @@
gradio>=5.0.0,<6.0.0
plotly>=5.18.0
pandas>=2.0.0
huggingface-hub>=1.0.0,<2.0.0
+6
View File
@@ -67,11 +67,17 @@ class EvalConfig:
# `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
# Set to 0 for auto-tuning based on available CPU cores and n_episodes.
batch_size: int = 0
# Number of rollout videos to save per evaluated task. Set to 0 to disable videos.
max_episodes_rendered: int = 10
# `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
# Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1.
use_async_envs: bool = True
def __post_init__(self) -> None:
if self.max_episodes_rendered < 0:
raise ValueError(
f"`max_episodes_rendered` must be non-negative, got {self.max_episodes_rendered}."
)
if self.batch_size == 0:
self.batch_size = self._auto_batch_size()
if self.batch_size > self.n_episodes:
+6
View File
@@ -56,6 +56,7 @@ class TrainPipelineConfig(HubMixin):
# Number of workers for the dataloader.
num_workers: int = 4
batch_size: int = 8
gradient_accumulation_steps: int = 1
steps: int = 100_000
eval_freq: int = 20_000
log_freq: int = 200
@@ -132,6 +133,11 @@ class TrainPipelineConfig(HubMixin):
if isinstance(self.dataset.repo_id, list):
raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")
if self.gradient_accumulation_steps <= 0:
raise ValueError(
f"`gradient_accumulation_steps` must be strictly positive, got {self.gradient_accumulation_steps}."
)
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
elif self.use_policy_training_preset and not self.resume:
+11 -1
View File
@@ -18,7 +18,15 @@
# from lerobot.utils.import_utils import require_package
# require_package("gymnasium", extra="<update_extra>", import_name="gymnasium")
from .configs import AlohaEnv, EnvConfig, HILSerlRobotEnvConfig, HubEnvConfig, PushtEnv
from .configs import (
AlohaEnv,
EnvConfig,
HILSerlRobotEnvConfig,
HubEnvConfig,
LiberoPlusEnv,
PushtEnv,
RoboMMEEnv,
)
from .factory import make_env, make_env_config, make_env_pre_post_processors
from .utils import check_env_attributes_and_types, close_envs, env_to_policy_features, preprocess_observation
@@ -27,7 +35,9 @@ __all__ = [
"EnvConfig",
"HILSerlRobotEnvConfig",
"HubEnvConfig",
"LiberoPlusEnv",
"PushtEnv",
"RoboMMEEnv",
"check_env_attributes_and_types",
"close_envs",
"env_to_policy_features",
+55
View File
@@ -574,3 +574,58 @@ class IsaaclabArenaEnv(HubEnvConfig):
),
PolicyProcessorPipeline(steps=[]),
)
@EnvConfig.register_subclass("libero_plus")
@dataclass
class LiberoPlusEnv(LiberoEnv):
"""Config for LIBERO-plus robustness benchmark evaluation."""
task: str = "libero_spatial"
@EnvConfig.register_subclass("robomme")
@dataclass
class RoboMMEEnv(EnvConfig):
"""RoboMME memory-augmented manipulation benchmark."""
task: str = "PickXtimes"
fps: int = 10
episode_length: int = 300
action_space: str = "joint_angle"
dataset_split: str = "test"
task_ids: list[int] | None = None
features: dict[str, PolicyFeature] = field(
default_factory=lambda: {
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)),
"image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
"wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)),
}
)
features_map: dict[str, str] = field(
default_factory=lambda: {
ACTION: ACTION,
"image": f"{OBS_IMAGES}.image",
"wrist_image": f"{OBS_IMAGES}.wrist_image",
OBS_STATE: OBS_STATE,
}
)
@property
def gym_kwargs(self) -> dict:
return {}
def create_envs(self, n_envs: int, use_async_envs: bool = True):
from .robomme import create_robomme_envs
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
return create_robomme_envs(
task=self.task,
n_envs=n_envs,
action_space_type=self.action_space,
dataset=self.dataset_split,
episode_length=self.episode_length,
task_ids=self.task_ids,
env_cls=env_cls,
)
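A quick sketch of constructing the new configs directly; in practice they are selected with `--env.type=libero_plus` or `--env.type=robomme` via the registry decorators above:
# Sketch: direct construction of the registered configs
# (imports assumed available; values illustrative).
from lerobot.envs.configs import LiberoPlusEnv, RoboMMEEnv

plus = LiberoPlusEnv()  # inherits fields and behavior from LiberoEnv
assert plus.task == "libero_spatial"

cfg = RoboMMEEnv(task="PickXtimes,BinFill", task_ids=[0, 1, 2])
assert cfg.gym_kwargs == {}  # RoboMME passes no extra gym kwargs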
+22 -7
View File
@@ -16,6 +16,7 @@
from __future__ import annotations
import os
import re
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping, Sequence
from functools import partial
@@ -69,14 +70,28 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
return ids
# LIBERO-plus perturbation variants encode the perturbation in the filename,
# but on disk only the base `.pruned_init` file exists, so the suffix is
# stripped. This mirrors LIBERO-plus's own suite.get_task_init_states(),
# reimplemented here so we can pass weights_only=False for PyTorch 2.6+
# numpy pickles.
_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
init_states_path = (
Path(get_libero_path("init_states"))
/ task_suite.tasks[i].problem_folder
/ task_suite.tasks[i].init_states_file
)
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
return init_states
task = task_suite.tasks[i]
filename = Path(task.init_states_file)
root = Path(get_libero_path("init_states"))
# `_add_` / `_level` variants store extra-object layouts under libero_newobj/
# as a flat array that must be reshaped to (1, -1).
if "_add_" in filename.name or "_level" in filename.name:
init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
return init_states.reshape(1, -1)
stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
init_states_path = root / task.problem_folder / stripped
return torch.load(init_states_path, weights_only=False) # nosec B614
def get_libero_dummy_action():
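A quick sanity check of the suffix-stripping behavior above; the task stems here are illustrative, not actual LIBERO-plus task names:
# Illustrative only: shows how _LIBERO_PERTURBATION_SUFFIX_RE maps a
# perturbation-variant stem back to the base init-states file name.
import re

_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
assert _RE.sub("", "pick_up_the_book_view_3") == "pick_up_the_book"
assert _RE.sub("", "open_the_drawer_tb_12") == "open_the_drawer"
assert _RE.sub("", "open_the_drawer") == "open_the_drawer"  # base names pass through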
+209
View File
@@ -0,0 +1,209 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RoboMME environment wrapper for LeRobot evaluation."""
from __future__ import annotations
from collections.abc import Callable, Sequence
from functools import partial
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium import spaces
ROBOMME_TASKS = [
"BinFill",
"PickXtimes",
"SwingXtimes",
"StopCube",
"VideoUnmask",
"VideoUnmaskSwap",
"ButtonUnmask",
"ButtonUnmaskSwap",
"PickHighlight",
"VideoRepick",
"VideoPlaceButton",
"VideoPlaceOrder",
"MoveCube",
"InsertPeg",
"PatternLock",
"RouteStick",
]
class RoboMMEGymEnv(gym.Env):
"""Thin Gymnasium wrapper around a single RoboMME episode env."""
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
def __init__(
self,
task: str = "PickXtimes",
action_space_type: str = "joint_angle",
dataset: str = "test",
episode_idx: int = 0,
max_steps: int = 300,
):
super().__init__()
from robomme.env_record_wrapper import BenchmarkEnvBuilder
self._builder = BenchmarkEnvBuilder(
env_id=task,
dataset=dataset,
action_space=action_space_type,
gui_render=False,
max_steps=max_steps,
)
self._max_episode_steps = max_steps
self._episode_idx = episode_idx
self._max_steps = max_steps
self._env = None
self._last_raw_obs: dict | None = None
action_dim = 8 if action_space_type == "joint_angle" else 7
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
self.observation_space = spaces.Dict(
{
"image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
"wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
"state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
}
)
def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
self._env = self._builder.make_env_for_episode(
episode_idx=self._episode_idx,
max_steps=self._max_steps,
)
obs, info = self._env.reset()
self._last_raw_obs = obs
return self._convert_obs(obs), self._convert_info(info)
def step(self, action):
obs, reward, terminated, truncated, info = self._env.step(action)
self._last_raw_obs = obs
terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
status = info.get("status", "ongoing")
conv_info = self._convert_info(info)
conv_info["is_success"] = status == "success"
return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
def render(self) -> np.ndarray | None:
if self._last_raw_obs is None:
return np.zeros((256, 256, 3), dtype=np.uint8)
front = self._last_raw_obs.get("front_rgb_list")
if front is None:
return np.zeros((256, 256, 3), dtype=np.uint8)
frame = front[-1] if isinstance(front, list) else front
return np.asarray(frame, dtype=np.uint8)
def _convert_obs(self, obs: dict) -> dict:
front_rgb = (
obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"]
)
wrist_rgb = (
obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"]
)
joint_state = (
obs["joint_state_list"][-1]
if isinstance(obs["joint_state_list"], list)
else obs["joint_state_list"]
)
gripper_state = (
obs["gripper_state_list"][-1]
if isinstance(obs["gripper_state_list"], list)
else obs["gripper_state_list"]
)
joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
state = np.concatenate([joint, gripper])
return {
"image": np.asarray(front_rgb, dtype=np.uint8),
"wrist_image": np.asarray(wrist_rgb, dtype=np.uint8),
"state": state,
}
def _convert_info(self, info: dict) -> dict:
return {
"status": info.get("status", "ongoing"),
"task_goal": info.get("task_goal", ""),
}
def _make_env_fns(
*,
task: str,
n_envs: int,
action_space_type: str,
dataset: str,
episode_length: int,
task_id: int,
) -> list[Callable[[], RoboMMEGymEnv]]:
def _make_one(episode_index: int) -> RoboMMEGymEnv:
return RoboMMEGymEnv(
task=task,
action_space_type=action_space_type,
dataset=dataset,
episode_idx=episode_index,
max_steps=episode_length,
)
return [partial(_make_one, task_id + i) for i in range(n_envs)]
def create_robomme_envs(
task: str,
n_envs: int = 1,
action_space_type: str = "joint_angle",
dataset: str = "test",
episode_length: int = 300,
task_ids: list[int] | None = None,
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
"""Create vectorized RoboMME environments for evaluation."""
if env_cls is None or not callable(env_cls):
raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
if not isinstance(n_envs, int) or n_envs <= 0:
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
if task_ids is None:
task_ids = [0]
task_names = [t.strip() for t in task.split(",") if t.strip()]
out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
for task_name in task_names:
envs_by_task: dict[int, gym.vector.VectorEnv] = {}
for task_id in task_ids:
fns = _make_env_fns(
task=task_name,
n_envs=n_envs,
action_space_type=action_space_type,
dataset=dataset,
episode_length=episode_length,
task_id=task_id,
)
envs_by_task[task_id] = env_cls(fns)
out[task_name] = envs_by_task
return out
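A usage sketch for the factory above, assuming gymnasium's `SyncVectorEnv` as the vector wrapper; in LeRobot the wrapper comes from `_make_vec_env_cls`, and running this for real requires the `robomme` package:
import gymnasium as gym

envs = create_robomme_envs(
    task="PickXtimes,BinFill",
    n_envs=2,
    task_ids=[0, 1],
    env_cls=gym.vector.SyncVectorEnv,
)
# envs["PickXtimes"][0] batches episodes 0 and 1 of the test split;
# envs["PickXtimes"][1] batches episodes 1 and 2 (episode_idx = task_id + i).
obs, info = envs["PickXtimes"][0].reset(seed=0)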
+1 -1
View File
@@ -572,7 +572,7 @@ def eval_main(cfg: EvalPipelineConfig):
preprocessor=preprocessor,
postprocessor=postprocessor,
n_episodes=cfg.eval.n_episodes,
max_episodes_rendered=10,
max_episodes_rendered=cfg.eval.max_episodes_rendered,
videos_dir=Path(cfg.output_dir) / "videos",
start_seed=cfg.seed,
max_parallel_tasks=cfg.env.max_parallel_tasks,
+98 -39
View File
@@ -71,6 +71,9 @@ def update_policy(
lr_scheduler=None,
lock=None,
rabc_weights_provider=None,
*,
do_optimizer_step: bool = True,
loss_divisor: int = 1,
) -> tuple[MetricsTracker, dict]:
"""
Performs a single training step to update the policy's weights.
@@ -122,34 +125,38 @@ def update_policy(
loss, output_dict = policy.forward(batch)
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
logged_loss = loss.detach()
if loss_divisor > 1:
loss = loss / loss_divisor
# Use accelerator's backward method
accelerator.backward(loss)
# Clip gradients if specified
if grad_clip_norm > 0:
grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
policy.parameters(), float("inf"), error_if_nonfinite=False
)
grad_norm_value = 0.0
if do_optimizer_step:
if grad_clip_norm > 0:
grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
policy.parameters(), float("inf"), error_if_nonfinite=False
)
grad_norm_value = grad_norm.item()
# Optimizer step
with lock if lock is not None else nullcontext():
optimizer.step()
with lock if lock is not None else nullcontext():
optimizer.step()
optimizer.zero_grad()
optimizer.zero_grad()
# Step through pytorch scheduler at every batch instead of epoch
if lr_scheduler is not None:
lr_scheduler.step()
# Step through pytorch scheduler at every optimizer step instead of epoch
if lr_scheduler is not None:
lr_scheduler.step()
# Update internal buffers if policy has update method
if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
# Update internal buffers if policy has update method
if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
train_metrics.loss = loss.item()
train_metrics.grad_norm = grad_norm.item()
train_metrics.loss = logged_loss.item()
train_metrics.grad_norm = grad_norm_value
train_metrics.lr = optimizer.param_groups[0]["lr"]
train_metrics.update_s = time.perf_counter() - start_time
return train_metrics, output_dict
@@ -359,8 +366,16 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
logging.info(f"{dataset.num_episodes=}")
num_processes = accelerator.num_processes
effective_bs = cfg.batch_size * num_processes
logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
micro_batch = cfg.batch_size
logical_batch = cfg.batch_size * cfg.gradient_accumulation_steps
effective_bs = logical_batch * num_processes
logging.info(
"Effective batch size: %s x %s x %s = %s",
micro_batch,
cfg.gradient_accumulation_steps,
num_processes,
effective_bs,
)
logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
@@ -407,9 +422,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
}
# Keep global batch size for logging; MetricsTracker handles world size internally.
effective_batch_size = cfg.batch_size * accelerator.num_processes
logical_batch_size = cfg.batch_size * cfg.gradient_accumulation_steps
effective_batch_size = logical_batch_size * accelerator.num_processes
train_tracker = MetricsTracker(
cfg.batch_size,
logical_batch_size,
dataset.num_frames,
dataset.num_episodes,
train_metrics,
@@ -431,21 +447,62 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
)
for _ in range(step, cfg.steps):
start_time = time.perf_counter()
batch = next(dl_iter)
batch = preprocessor(batch)
train_tracker.dataloading_s = time.perf_counter() - start_time
step_dataloading_s = 0.0
step_update_s = 0.0
step_losses = []
step_grad_norm = 0.0
step_lr = optimizer.param_groups[0]["lr"]
output_dict = {}
optimizer.zero_grad()
for accumulation_idx in range(cfg.gradient_accumulation_steps):
start_time = time.perf_counter()
batch = next(dl_iter)
batch = preprocessor(batch)
step_dataloading_s += time.perf_counter() - start_time
train_tracker, output_dict = update_policy(
train_tracker,
policy,
batch,
optimizer,
cfg.optimizer.grad_clip_norm,
accelerator=accelerator,
lr_scheduler=lr_scheduler,
rabc_weights_provider=rabc_weights,
)
is_last_microbatch = accumulation_idx == cfg.gradient_accumulation_steps - 1
micro_metrics = MetricsTracker(
cfg.batch_size,
dataset.num_frames,
dataset.num_episodes,
{
"loss": AverageMeter("loss", ":.3f"),
"grad_norm": AverageMeter("grdn", ":.3f"),
"lr": AverageMeter("lr", ":0.1e"),
"update_s": AverageMeter("updt_s", ":.3f"),
},
accelerator=accelerator,
)
sync_context = (
nullcontext()
if is_last_microbatch or accelerator.num_processes == 1
else accelerator.no_sync(policy)
)
with sync_context:
micro_metrics, micro_output_dict = update_policy(
micro_metrics,
policy,
batch,
optimizer,
cfg.optimizer.grad_clip_norm,
accelerator=accelerator,
lr_scheduler=lr_scheduler if is_last_microbatch else None,
rabc_weights_provider=rabc_weights,
do_optimizer_step=is_last_microbatch,
loss_divisor=cfg.gradient_accumulation_steps,
)
step_update_s += micro_metrics.update_s.val
step_losses.append(micro_metrics.loss.val)
if is_last_microbatch:
step_grad_norm = micro_metrics.grad_norm.val
step_lr = micro_metrics.lr.val
output_dict = micro_output_dict
train_tracker.loss = sum(step_losses) / len(step_losses)
train_tracker.grad_norm = step_grad_norm
train_tracker.lr = step_lr
train_tracker.update_s = step_update_s
train_tracker.dataloading_s = step_dataloading_s
# Note: eval and checkpoint happen *after* the `step`th training update has completed, so we
# increment `step` here.
@@ -510,7 +567,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
postprocessor=postprocessor,
n_episodes=cfg.eval.n_episodes,
videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
max_episodes_rendered=4,
max_episodes_rendered=cfg.eval.max_episodes_rendered,
start_seed=cfg.seed,
max_parallel_tasks=cfg.env.max_parallel_tasks,
)
@@ -541,7 +598,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
if wandb_logger:
wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
wandb_logger.log_video(eval_info["overall"]["video_paths"][0], step, mode="eval")
video_paths = eval_info["overall"].get("video_paths", [])
if video_paths:
wandb_logger.log_video(video_paths[0], step, mode="eval")
accelerator.wait_for_everyone()
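To make the accumulation contract explicit: dividing each microbatch loss by `gradient_accumulation_steps` and stepping the optimizer once means the summed gradients match those of one large mean-reduced batch, so the per-policy LR presets carry over unchanged. A minimal standalone sketch of that contract, assuming a mean-reduced loss and equal-sized microbatches:
import torch

model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
K = 4  # stands in for cfg.gradient_accumulation_steps
opt.zero_grad()
for _ in range(K):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / K  # the loss_divisor
    loss.backward()  # grads accumulate in .grad across microbatches
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip once, on summed grads
opt.step()  # one optimizer step per logical batch
opt.zero_grad()
The `accelerator.no_sync(policy)` context in the loop above serves the same goal under DDP: gradient all-reduce is skipped on non-final microbatches and runs only once, on the last one.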
+70
View File
@@ -0,0 +1,70 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from huggingface_hub import HfApi
def utc_timestamp_slug(now: datetime | None = None) -> str:
current = now or datetime.now(UTC)
return current.strftime("%Y%m%dT%H%M%SZ")
def make_hub_file_url(repo_id: str, path_in_repo: str, repo_type: str = "dataset") -> str:
prefix = "datasets/" if repo_type == "dataset" else ""
return f"https://huggingface.co/{prefix}{repo_id}/resolve/main/{path_in_repo}"
def write_json(path: Path, payload: dict[str, Any]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, indent=2, sort_keys=True))
@dataclass(frozen=True)
class UploadTarget:
local_path: Path
path_in_repo: str
def upload_targets(
repo_id: str,
targets: list[UploadTarget],
*,
repo_type: str = "dataset",
token: str | None = None,
private: bool | None = None,
commit_message: str | None = None,
) -> dict[str, str]:
api = HfApi(token=token)
api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
uploaded: dict[str, str] = {}
for target in targets:
api.upload_file(
path_or_fileobj=str(target.local_path),
path_in_repo=target.path_in_repo,
repo_id=repo_id,
repo_type=repo_type,
commit_message=commit_message or f"Upload {target.path_in_repo}",
)
uploaded[target.path_in_repo] = make_hub_file_url(repo_id, target.path_in_repo, repo_type=repo_type)
return uploaded
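A hypothetical publish flow using the helpers above; the repo id, token setup, and payload are illustrative, and write access to the dataset repo is required:
from pathlib import Path

row = {"benchmark": "libero", "policy": "act", "git_commit": "deadbeef"}
slug = utc_timestamp_slug()  # e.g. "20260415T193133Z"
row_path = Path("/tmp/row.json")
write_json(row_path, row)
urls = upload_targets(
    "lerobot/benchmark-history",
    [UploadTarget(local_path=row_path, path_in_repo=f"rows/libero/act/{slug}.json")],
    commit_message="Publish benchmark row",
)
# urls maps path_in_repo -> resolve/main URL, as built by make_hub_file_url().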
+142
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from benchmarks.run_benchmark_matrix import (
PlannedJob,
compute_gradient_accumulation_steps,
plan_jobs,
render_sbatch_script,
write_manifest,
)
def _one_job(job_list: list[PlannedJob]) -> PlannedJob:
assert len(job_list) == 1
return job_list[0]
def test_compute_gradient_accumulation_steps_for_fixed_effective_batch():
assert compute_gradient_accumulation_steps(
effective_batch_size=256,
num_gpus=8,
microbatch_per_gpu=32,
) == 1
assert compute_gradient_accumulation_steps(
effective_batch_size=256,
num_gpus=4,
microbatch_per_gpu=32,
) == 2
assert compute_gradient_accumulation_steps(
effective_batch_size=256,
num_gpus=1,
microbatch_per_gpu=32,
) == 8
def test_plan_jobs_filters_libero_plus_only(tmp_path):
jobs = plan_jobs(
output_dir=tmp_path,
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
policies=["pi0", "act"],
benchmarks=["libero_plus"],
)
assert [job.benchmark for job in jobs] == ["libero_plus", "libero_plus"]
assert [job.policy for job in jobs] == ["pi0", "act"]
def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path):
jobs = plan_jobs(
output_dir=tmp_path,
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
policies=["pi0"],
benchmarks=["libero_plus", "robomme"],
)
assert [job.benchmark for job in jobs] == ["libero_plus", "robomme"]
assert jobs[0].effective_batch_size == 256
assert jobs[1].effective_batch_size == 256
def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path):
jobs = plan_jobs(
output_dir=tmp_path,
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
policies=["pi0", "xvla", "act"],
benchmarks=["robomme"],
)
by_policy = {job.policy: job for job in jobs}
assert by_policy["pi0"].num_gpus == 8
assert by_policy["pi0"].gradient_accumulation_steps == 1
assert by_policy["xvla"].num_gpus == 4
assert by_policy["xvla"].gradient_accumulation_steps == 2
assert by_policy["act"].num_gpus == 1
assert by_policy["act"].gradient_accumulation_steps == 8
def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path):
job = _one_job(
plan_jobs(
output_dir=tmp_path,
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
policies=["pi0_fast"],
benchmarks=["robomme"],
)
)
script = render_sbatch_script(
job=job,
output_dir=tmp_path,
results_repo_id="lerobot/benchmark-history",
git_commit="deadbeef",
)
assert "docker/Dockerfile" not in script
assert "lerobot-benchmark-robomme:latest" in script
assert '--dataset.repo_id="lerobot/robomme"' in script
assert '--env.type="robomme"' in script
assert "--gradient_accumulation_steps=1" in script
assert "lerobot-train-tokenizer" in script
assert "benchmarks/publish_benchmark_result.py" in script
def test_write_manifest_records_job_metadata(tmp_path):
jobs = plan_jobs(
output_dir=tmp_path,
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
policies=["pi0"],
benchmarks=["libero_plus", "robomme"],
)
manifest_path = write_manifest(
output_dir=tmp_path,
jobs=jobs,
git_commit="deadbeef",
hub_org="lerobot",
results_repo="lerobot/benchmark-history",
)
manifest = json.loads(manifest_path.read_text())
assert manifest["git_commit"] == "deadbeef"
assert manifest["results_repo"] == "lerobot/benchmark-history"
assert [job["benchmark"] for job in manifest["jobs"]] == ["libero_plus", "robomme"]
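The tests above pin down the arithmetic; a minimal implementation consistent with them, sketched here because `benchmarks/run_benchmark_matrix.py` itself is not shown in this section:
def compute_gradient_accumulation_steps(
    *, effective_batch_size: int, num_gpus: int, microbatch_per_gpu: int
) -> int:
    # One optimizer step consumes num_gpus * microbatch_per_gpu samples;
    # accumulate until the fixed effective batch size is reached.
    per_step = num_gpus * microbatch_per_gpu
    if effective_batch_size % per_step != 0:
        raise ValueError(f"effective_batch_size={effective_batch_size} not divisible by {per_step}")
    return effective_batch_size // per_step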
+123
View File
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
from types import ModuleType
from unittest.mock import MagicMock
import numpy as np
def _install_robomme_stub():
stub = ModuleType("robomme")
wrapper_stub = ModuleType("robomme.env_record_wrapper")
class FakeBuilder:
def __init__(self, **kwargs):
pass
def make_env_for_episode(self, episode_idx: int, max_steps: int):
env = MagicMock()
obs = {
"front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
"wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
"joint_state_list": [np.zeros(7, dtype=np.float32)],
"gripper_state_list": [np.zeros(2, dtype=np.float32)],
}
env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
return env
wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
stub.env_record_wrapper = wrapper_stub
sys.modules["robomme"] = stub
sys.modules["robomme.env_record_wrapper"] = wrapper_stub
def _uninstall_robomme_stub():
sys.modules.pop("robomme", None)
sys.modules.pop("robomme.env_record_wrapper", None)
def test_robomme_env_config_defaults():
from lerobot.envs.configs import RoboMMEEnv
cfg = RoboMMEEnv()
assert cfg.task == "PickXtimes"
assert cfg.fps == 10
assert cfg.episode_length == 300
assert cfg.action_space == "joint_angle"
assert cfg.dataset_split == "test"
assert cfg.task_ids is None
def test_robomme_features_map():
from lerobot.envs.configs import RoboMMEEnv
from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
cfg = RoboMMEEnv()
assert cfg.features_map[ACTION] == ACTION
assert cfg.features_map["image"] == f"{OBS_IMAGES}.image"
assert cfg.features_map["wrist_image"] == f"{OBS_IMAGES}.wrist_image"
assert cfg.features_map[OBS_STATE] == OBS_STATE
def test_convert_obs_list_format():
_install_robomme_stub()
try:
from lerobot.envs.robomme import RoboMMEGymEnv
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
front = np.full((256, 256, 3), 42, dtype=np.uint8)
wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
joints = np.arange(7, dtype=np.float32)
gripper = np.array([0.5, 0.5], dtype=np.float32)
obs_raw = {
"front_rgb_list": [np.zeros_like(front), front],
"wrist_rgb_list": [np.zeros_like(wrist), wrist],
"joint_state_list": [np.zeros(7, dtype=np.float32), joints],
"gripper_state_list": [np.zeros(2, dtype=np.float32), gripper],
}
result = env._convert_obs(obs_raw)
np.testing.assert_array_equal(result["image"], front)
np.testing.assert_array_equal(result["wrist_image"], wrist)
assert result["state"].shape == (8,)
np.testing.assert_array_almost_equal(result["state"][:7], joints)
assert result["state"][7] == gripper[0]
finally:
_uninstall_robomme_stub()
def test_create_robomme_envs_multi_task():
_install_robomme_stub()
try:
from lerobot.envs.robomme import create_robomme_envs
env_cls = MagicMock(return_value=MagicMock())
result = create_robomme_envs(
task="PickXtimes,BinFill,StopCube",
n_envs=1,
env_cls=env_cls,
)
assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"}
finally:
_uninstall_robomme_stub()
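The install/uninstall pair could equally be packaged as a pytest fixture so cleanup runs automatically even when an assertion fails; a sketch, not part of this diff:
import pytest

@pytest.fixture
def robomme_stub():
    # Installs the fake robomme modules for the duration of one test.
    _install_robomme_stub()
    try:
        yield
    finally:
        _uninstall_robomme_stub()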