diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 79d5614b2..15adb1242 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -310,3 +310,181 @@ jobs: name: metaworld-metrics path: /tmp/metaworld-artifacts/metrics.json if-no-files-found: warn + + # ── LIBERO-plus ─────────────────────────────────────────────────────────── + libero-plus-integration-test: + name: LIBERO-plus — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Build LIBERO-plus benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.libero_plus + push: false + load: true + tags: lerobot-benchmark-libero-plus:ci + cache-from: type=local,src=/tmp/.buildx-cache-libero-plus + cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max + + - name: Run LIBERO-plus smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name libero-plus-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + lerobot-benchmark-libero-plus:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_libero_plus \ + --env.type=libero_plus \ + --env.task=libero_spatial \ + '--env.task_ids=[0,100,260,500,1000,1500,2000,2400]' \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--env.camera_name_mapping={\"agentview_image\": \"camera1\", 
\"robot0_eye_in_hand_image\": \"camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero_plus --task libero_spatial \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy LIBERO-plus artifacts from container + if: always() + run: | + mkdir -p /tmp/libero-plus-artifacts + docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true + docker rm -f libero-plus-eval || true + + - name: Parse LIBERO-plus eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/libero-plus-artifacts \ + --env libero_plus \ + --task libero_spatial \ + --policy lerobot/smolvla_libero_plus + + - name: Upload LIBERO-plus rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-rollout-video + path: /tmp/libero-plus-artifacts/videos/ + if-no-files-found: warn + + - name: Upload LIBERO-plus eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-metrics + path: /tmp/libero-plus-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOMME ─────────────────────────────────────────────────────────────── + robomme-integration-test: + name: RoboMME — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Build RoboMME benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robomme + push: false + load: true + tags: lerobot-benchmark-robomme:ci + + - name: Run RoboMME smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robomme-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + lerobot-benchmark-robomme:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robomme \ + --env.type=robomme \ + --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \ + --env.dataset_split=test \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=3 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robomme --task PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboMME artifacts from container + if: always() + run: | + mkdir -p /tmp/robomme-artifacts + docker cp robomme-eval:/tmp/eval-artifacts/. 
/tmp/robomme-artifacts/ 2>/dev/null || true + docker rm -f robomme-eval || true + + - name: Parse RoboMME eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robomme-artifacts \ + --env robomme \ + --task PickXtimes \ + --policy lerobot/smolvla_robomme + + - name: Upload RoboMME rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-rollout-video + path: /tmp/robomme-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboMME eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-metrics + path: /tmp/robomme-artifacts/metrics.json + if-no-files-found: warn diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..1169500ea --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. diff --git a/benchmarks/publish_benchmark_result.py b/benchmarks/publish_benchmark_result.py new file mode 100644 index 000000000..8df5a6458 --- /dev/null +++ b/benchmarks/publish_benchmark_result.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""

from __future__ import annotations

import argparse
import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug


def load_json_if_exists(path: Path | None) -> dict[str, Any] | None:
    """Return the parsed JSON document stored at ``path``.

    Args:
        path: File to read. ``None`` is accepted because
            ``find_latest_train_config_path`` returns ``None`` when a run has
            no checkpoint, and that value is passed straight to this function.

    Returns:
        The decoded JSON value, or ``None`` when ``path`` is ``None`` or the
        file does not exist.
    """
    if path is None or not path.exists():
        return None
    return json.loads(path.read_text())


def _checkpoint_sort_key(config_path: Path) -> tuple[bool, int, str]:
    """Order checkpoint directories by step number instead of by raw string.

    The checkpoint directory name is the third path component from the end
    (``<name>/pretrained_model/train_config.json``). Purely numeric names are
    compared as integers so that unpadded or longer step counts do not sort
    before shorter ones. Non-numeric names keep the position plain string
    ordering gave them relative to ASCII digits: after every numeric name.
    """
    name = config_path.parts[-3]
    if name.isdecimal():
        return (False, int(name), name)
    return (True, 0, name)


def find_latest_train_config_path(run_root: Path) -> Path | None:
    """Locate the ``train_config.json`` of the newest checkpoint under ``run_root``.

    Args:
        run_root: Root directory of one benchmark run.

    Returns:
        Path to ``train/checkpoints/<name>/pretrained_model/train_config.json``
        for the last checkpoint directory in sort order, or ``None`` when the
        checkpoints directory is missing or contains no such file.
    """
    checkpoints_dir = run_root / "train" / "checkpoints"
    if not checkpoints_dir.exists():
        return None
    candidates = sorted(
        checkpoints_dir.glob("*/pretrained_model/train_config.json"),
        key=_checkpoint_sort_key,
    )
    return candidates[-1] if candidates else None


def parse_args() -> argparse.Namespace:
    """Parse the command-line options describing one finished benchmark run."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--benchmark", required=True)
    parser.add_argument("--policy", required=True)
    parser.add_argument("--run_root", required=True, type=Path)
    parser.add_argument("--results_repo", required=True)
    parser.add_argument("--git_commit", required=True)
    parser.add_argument("--num_gpus", required=True, type=int)
    parser.add_argument("--microbatch_per_gpu", required=True, type=int)
    parser.add_argument("--gradient_accumulation_steps", required=True, type=int)
    parser.add_argument("--effective_batch_size", required=True, type=int)
    parser.add_argument("--train_wall_time_s", required=True, type=float)
    parser.add_argument("--eval_wall_time_s", required=True, type=float)
    parser.add_argument("--slurm_job_id", default="")
    parser.add_argument("--docker_image", required=True)
    return parser.parse_args()


def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
    """Assemble the result row for a run and the list of files to upload.

    The row is also written to ``<run_root>/benchmark_row.json`` so that the
    first upload target exists on disk before ``upload_targets`` is called.

    Args:
        args: Namespace produced by ``parse_args``.

    Returns:
        A ``(row, upload_list)`` pair: the JSON-serialisable row and the
        upload targets (the row itself, plus ``eval_info.json`` and the
        latest ``train_config.json`` when those files exist).
    """
    now = datetime.now(UTC)
    created_at = now.isoformat()
    timestamp = utc_timestamp_slug(now)
    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"

    # Both inputs are optional: a run that failed early may have neither.
    eval_info_path = args.run_root / "eval" / "eval_info.json"
    eval_info = load_json_if_exists(eval_info_path) or {}
    train_config_path = find_latest_train_config_path(args.run_root)
    train_config = load_json_if_exists(train_config_path) or {}

    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"

    row = {
        "schema_version": 1,
        "created_at": created_at,
        "run_id": run_id,
        "benchmark": args.benchmark,
        "policy": args.policy,
        "git_commit": args.git_commit,
        "slurm_job_id": args.slurm_job_id or None,
        "docker_image": args.docker_image,
        "resources": {
            "num_gpus": args.num_gpus,
            "microbatch_per_gpu": args.microbatch_per_gpu,
            "gradient_accumulation_steps": args.gradient_accumulation_steps,
            "effective_batch_size": args.effective_batch_size,
        },
        "timings": {
            "train_wall_time_s": args.train_wall_time_s,
            "eval_wall_time_s": args.eval_wall_time_s,
            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
        },
        "eval": {
            "overall": eval_info.get("overall", {}),
            "per_group": eval_info.get("per_group", {}),
            "per_task_count": len(eval_info.get("per_task", [])),
        },
        "paths": {
            "run_root": str(args.run_root),
            "train_dir": str(args.run_root / "train"),
            "eval_dir": str(args.run_root / "eval"),
        },
        "train_config": train_config,
        "artifact_urls": {
            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
        },
    }

    row_path = args.run_root / "benchmark_row.json"
    row_path.parent.mkdir(parents=True, exist_ok=True)
    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]

    if eval_info_path.exists():
        row["artifact_urls"]["eval_info"] = make_hub_file_url(
            args.results_repo, f"{artifact_prefix}/eval_info.json"
        )
        upload_list.append(
            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
        )

    if train_config_path is not None and train_config_path.exists():
        row["artifact_urls"]["train_config"] = make_hub_file_url(
            args.results_repo, f"{artifact_prefix}/train_config.json"
        )
        upload_list.append(
            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
        )

    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
    return row, upload_list


def main() -> int:
    """Build the row, upload it with its artifacts, and print the final row.

    Returns:
        Process exit code (always ``0``; failures surface as exceptions).
    """
    args = parse_args()
    row, upload_list = build_row(args)
    uploaded = upload_targets(
        repo_id=args.results_repo,
        targets=upload_list,
        repo_type="dataset",
        private=False,
        commit_message=f"Add benchmark row {row['run_id']}",
    )
    # The local copy is rewritten after the upload so it records what was
    # uploaded; the copy pushed to the Hub was written before this key existed.
    row["uploaded_paths"] = uploaded
    row_path = args.run_root / "benchmark_row.json"
    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
    print(json.dumps(row, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())


# ---------------------------------------------------------------------------
# Patch header of the next file, which shares this source line in the
# flattened diff; preserved here as comments.
#
# diff --git a/benchmarks/run_benchmark_matrix.py b/benchmarks/run_benchmark_matrix.py
# new file mode 100644
# index 000000000..5ef50c4ad
# --- /dev/null
# +++ b/benchmarks/run_benchmark_matrix.py
# @@ -0,0 +1,647 @@
#
# #!/usr/bin/env python
#
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""

from __future__ import annotations

import argparse
import json
import math
import subprocess
from dataclasses import asdict, dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from lerobot.utils.history_repo import utc_timestamp_slug

MAX_GPUS = 8
MIN_GPUS = 1
DEFAULT_STEPS = 20_000
DEFAULT_EFFECTIVE_BATCH_SIZE = 256
DEFAULT_MICROBATCH_PER_GPU = 32
DEFAULT_EVAL_BATCH_SIZE = 1
DEFAULT_CPUS_PER_GPU = 8
DEFAULT_MEMORY_PER_GPU_GB = 40


@dataclass(frozen=True)
class BenchmarkSpec:
    """Static description of one benchmark: dataset, image and eval settings."""

    name: str
    # Dataset passed to training as ``dataset.repo_id``.
    dataset_repo_id: str
    # Docker image every stage of the job runs in.
    docker_image: str
    # Values forwarded to evaluation as ``env.type`` / ``env.task``.
    eval_env_type: str
    eval_task: str
    eval_n_episodes: int
    train_steps: int = DEFAULT_STEPS
    effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
    # Extra CLI arguments merged into the train / eval argument dicts.
    train_extra_args: dict[str, Any] = field(default_factory=dict)
    eval_extra_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PolicySpec:
    """Static description of one policy and the resources it is trained with."""

    name: str
    policy_type: str
    num_gpus: int
    # When set, training starts from this path (``policy.path``); otherwise
    # ``policy.type`` is passed instead.
    policy_path: str | None = None
    microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
    # Policy-level CLI arguments; applied after the benchmark-level ones.
    extra_train_args: dict[str, Any] = field(default_factory=dict)
    extra_eval_args: dict[str, Any] = field(default_factory=dict)
    # When true, a tokenizer-training stage is rendered before training.
    needs_tokenizer: bool = False
    tokenizer_args: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class PlannedJob:
    """Fully resolved (benchmark, policy) job, ready to be rendered to sbatch."""

    benchmark: str
    policy: str
    # Run directory relative to the output directory (``runs/<bench>/<policy>/<slug>``).
    run_rel: str
    num_gpus: int
    microbatch_per_gpu: int
    gradient_accumulation_steps: int
    effective_batch_size: int
    docker_image: str
    train_args: dict[str, Any]
    eval_args: dict[str, Any]
    # ``None`` when the policy does not need a tokenizer stage.
    tokenizer_args: dict[str, Any] | None
    script_path: str


BENCHMARKS: dict[str, BenchmarkSpec] = {
    "libero_plus": BenchmarkSpec(
        name="libero_plus",
        dataset_repo_id="lerobot/libero_plus",
        docker_image="lerobot-benchmark-libero-plus:latest",
        eval_env_type="libero_plus",
        eval_task="libero_spatial,libero_object,libero_goal,libero_10",
        eval_n_episodes=10,
        train_extra_args={
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.image2": "observation.images.camera2",
            },
        },
        eval_extra_args={
            "env.camera_name_mapping": {
                "agentview_image": "camera1",
                "robot0_eye_in_hand_image": "camera2",
            },
            "env.max_parallel_tasks": 1,
            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
            "eval.use_async_envs": False,
            "eval.max_episodes_rendered": 0,
            "policy.device": "cuda",
        },
    ),
    "robomme": BenchmarkSpec(
        name="robomme",
        dataset_repo_id="lerobot/robomme",
        docker_image="lerobot-benchmark-robomme:latest",
        eval_env_type="robomme",
        eval_task=(
            "BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
            "ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
            "VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
        ),
        eval_n_episodes=50,
        train_extra_args={
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.wrist_image": "observation.images.camera2",
            },
        },
        eval_extra_args={
            "env.dataset_split": "test",
            "env.max_parallel_tasks": 1,
            "rename_map": {
                "observation.images.image": "observation.images.camera1",
                "observation.images.wrist_image": "observation.images.camera2",
            },
            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
            "eval.use_async_envs": False,
            "eval.max_episodes_rendered": 0,
            "policy.device": "cuda",
        },
    ),
}


POLICIES: dict[str, PolicySpec] = {
    "pi0": PolicySpec(
        name="pi0",
        policy_type="pi0",
        policy_path="lerobot/pi0_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
    ),
    "pi0_fast": PolicySpec(
        name="pi0_fast",
        policy_type="pi0_fast",
        policy_path="lerobot/pi0fast-base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
        needs_tokenizer=True,
        tokenizer_args={
            "action_horizon": 30,
            "encoded_dims": "0:7",
            "normalization_mode": "QUANTILES",
            "vocab_size": 1024,
            "scale": 10.0,
            "push_to_hub": True,
        },
    ),
    "pi05": PolicySpec(
        name="pi05",
        policy_type="pi05",
        policy_path="lerobot/pi05_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 0,
        },
    ),
    "groot": PolicySpec(
        name="groot",
        policy_type="groot",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
            "policy.tune_diffusion_model": True,
            "policy.tune_projector": True,
            "policy.tune_llm": False,
            "policy.tune_visual": False,
            "policy.use_bf16": True,
        },
    ),
    "act": PolicySpec(
        name="act",
        policy_type="act",
        num_gpus=1,
        extra_train_args={
            "policy.n_action_steps": 30,
        },
    ),
    "diffusion": PolicySpec(
        name="diffusion",
        policy_type="diffusion",
        num_gpus=1,
        extra_train_args={
            "policy.horizon": 32,
            "policy.n_action_steps": 30,
            "policy.n_obs_steps": 2,
        },
    ),
    "smolvla": PolicySpec(
        name="smolvla",
        policy_type="smolvla",
        policy_path="lerobot/smolvla_base",
        num_gpus=8,
        extra_train_args={
            "policy.n_action_steps": 30,
            "policy.load_vlm_weights": True,
            "policy.freeze_vision_encoder": False,
            "policy.train_expert_only": False,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 1,
        },
    ),
    "xvla": PolicySpec(
        name="xvla",
        policy_type="xvla",
        policy_path="lerobot/xvla-widowx",
        num_gpus=4,
        extra_train_args={
            "policy.n_action_steps": 32,
            "policy.scheduler_decay_steps": DEFAULT_STEPS,
            "policy.empty_cameras": 1,
        },
    ),
    "multi_task_dit": PolicySpec(
        name="multi_task_dit",
        policy_type="multi_task_dit",
        num_gpus=1,
        extra_train_args={
            "policy.horizon": 32,
            "policy.n_action_steps": 30,
        },
    ),
}


def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
    """Prefix ``repo_or_id`` with ``hub_org`` unless it already contains a ``/``."""
    return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"


def get_requested_names(
    requested: list[str] | None,
    available: dict[str, Any],
    *,
    kind: str,
) -> list[str]:
    """Validate a user-supplied selection against the known registry keys.

    Args:
        requested: Names given on the command line; ``None`` or empty selects
            everything in ``available``.
        available: Registry whose keys are the valid names.
        kind: Human-readable noun used in the error message.

    Returns:
        The selected names in first-seen order, without repeats. A repeated
        name would otherwise plan the same job twice, overwrite its script
        and, with ``--submit``, submit it twice.

    Raises:
        ValueError: If any requested name is not a key of ``available``.
    """
    if not requested:
        return list(available)
    unknown = sorted(set(requested) - set(available))
    if unknown:
        raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
    return list(dict.fromkeys(requested))


def compute_gradient_accumulation_steps(
    *,
    effective_batch_size: int,
    num_gpus: int,
    microbatch_per_gpu: int,
) -> int:
    """Return how many micro-steps are needed to reach ``effective_batch_size``.

    Raises:
        ValueError: If ``num_gpus * microbatch_per_gpu`` is not positive or
            does not divide ``effective_batch_size`` exactly.
    """
    per_step_batch = num_gpus * microbatch_per_gpu
    # A non-positive per-step batch is reported the same way as a
    # non-divisible one instead of escaping as ZeroDivisionError.
    if per_step_batch <= 0 or effective_batch_size % per_step_batch != 0:
        raise ValueError(
            f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
            f"{microbatch_per_gpu=}."
        )
    return effective_batch_size // per_step_batch


def make_run_slug() -> str:
    """Return the slug used as the leaf directory name of a run."""
    return utc_timestamp_slug()


def shell_value(value: Any) -> str:
    """Render ``value`` as a double-quoted word for the generated job scripts.

    Booleans become ``true`` / ``false``, dicts and lists are JSON-encoded
    with sorted keys, anything else goes through ``str``.

    The result is always embedded in the single-quoted ``bash -lc '...'``
    scripts built by ``render_sbatch_script``. It is therefore escaped twice:
    backslash, double quote, dollar and backtick for the inner double-quoted
    context, and a single quote as ``'\\''`` so it cannot terminate the
    surrounding single-quoted script.
    """
    if isinstance(value, bool):
        text = "true" if value else "false"
    elif isinstance(value, (dict, list)):
        text = json.dumps(value, sort_keys=True)
    else:
        text = str(value)
    escaped = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("$", "\\$")
        .replace("`", "\\`")
        # Must come last: the backslash it introduces belongs to the outer
        # shell and must not be doubled by the first replacement.
        .replace("'", "'\\''")
    )
    return f'"{escaped}"'


def format_cli_args(args: dict[str, Any]) -> str:
    """Render ``args`` as ``--key="value"`` lines joined by shell continuations."""
    lines = [f"      --{key}={shell_value(value)}" for key, value in args.items()]
    return " \\\n".join(lines)


def build_train_args(
    *,
    benchmark: BenchmarkSpec,
    policy: PolicySpec,
    train_dir: str,
    gradient_accumulation_steps: int,
) -> dict[str, Any]:
    """Build the training CLI arguments for one (benchmark, policy) pair.

    Benchmark-level extras are applied first and policy-level extras last, so
    a policy can override a benchmark setting with the same key.
    """
    args: dict[str, Any] = {
        "dataset.repo_id": benchmark.dataset_repo_id,
        "output_dir": train_dir,
        "steps": benchmark.train_steps,
        "batch_size": policy.microbatch_per_gpu,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "eval_freq": 0,
        # One checkpoint, written at the final step.
        "save_freq": benchmark.train_steps,
        "save_checkpoint": True,
        "log_freq": 100,
        "wandb.enable": False,
        "policy.push_to_hub": False,
        "policy.device": "cuda",
    }
    if policy.policy_path:
        args["policy.path"] = policy.policy_path
    else:
        args["policy.type"] = policy.policy_type
    args.update(benchmark.train_extra_args)
    args.update(policy.extra_train_args)
    return args


def build_eval_args(
    *,
    benchmark: BenchmarkSpec,
    policy: PolicySpec,
    checkpoint_path: str,
    eval_dir: str,
) -> dict[str, Any]:
    """Build the evaluation CLI arguments for one (benchmark, policy) pair.

    As with training, policy-level extras override benchmark-level ones.
    """
    args: dict[str, Any] = {
        "policy.path": checkpoint_path,
        "env.type": benchmark.eval_env_type,
        "env.task": benchmark.eval_task,
        "eval.n_episodes": benchmark.eval_n_episodes,
        "output_dir": eval_dir,
    }
    args.update(benchmark.eval_extra_args)
    args.update(policy.extra_eval_args)
    return args


def plan_jobs(
    *,
    output_dir: Path,
    hub_org: str,
    results_repo: str,
    policies: list[str],
    benchmarks: list[str],
) -> list[PlannedJob]:
    """Resolve every requested (benchmark, policy) pair into a ``PlannedJob``.

    Args:
        output_dir: Host directory; job scripts are planned under ``slurm/``.
        hub_org: Hub namespace used to name the tokenizer repository of
            policies that need one.
        results_repo: Accepted for interface compatibility; not used here.
        policies: Keys of ``POLICIES`` to plan.
        benchmarks: Keys of ``BENCHMARKS`` to plan.

    Returns:
        One job per pair, benchmarks in the outer loop.

    Raises:
        ValueError: If a policy's GPU count and micro-batch cannot reach the
            benchmark's effective batch size.
    """
    _ = results_repo  # intentionally unused; kept so callers need not change
    scripts_dir = output_dir / "slurm"
    jobs: list[PlannedJob] = []
    for benchmark_name in benchmarks:
        benchmark = BENCHMARKS[benchmark_name]
        for policy_name in policies:
            policy = POLICIES[policy_name]
            num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
            run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
            # Paths below are as seen inside the container, where the host
            # output directory is mounted at /benchmark-output.
            run_root = f"/benchmark-output/{run_rel}"
            gradient_accumulation_steps = compute_gradient_accumulation_steps(
                effective_batch_size=benchmark.effective_batch_size,
                num_gpus=num_gpus,
                microbatch_per_gpu=policy.microbatch_per_gpu,
            )
            train_dir = f"{run_root}/train"
            checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
            eval_dir = f"{run_root}/eval"
            train_args = build_train_args(
                benchmark=benchmark,
                policy=policy,
                train_dir=train_dir,
                gradient_accumulation_steps=gradient_accumulation_steps,
            )
            eval_args = build_eval_args(
                benchmark=benchmark,
                policy=policy,
                checkpoint_path=checkpoint_path,
                eval_dir=eval_dir,
            )
            tokenizer_args = None
            if policy.needs_tokenizer:
                tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
                tokenizer_args = {
                    "repo_id": benchmark.dataset_repo_id,
                    "output_dir": f"{run_root}/tokenizer",
                    "hub_repo_id": tokenizer_repo_id,
                    **policy.tokenizer_args,
                }
                train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
            script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
            jobs.append(
                PlannedJob(
                    benchmark=benchmark_name,
                    policy=policy_name,
                    run_rel=run_rel,
                    num_gpus=num_gpus,
                    microbatch_per_gpu=policy.microbatch_per_gpu,
                    gradient_accumulation_steps=gradient_accumulation_steps,
                    effective_batch_size=benchmark.effective_batch_size,
                    docker_image=benchmark.docker_image,
                    train_args=train_args,
                    eval_args=eval_args,
                    tokenizer_args=tokenizer_args,
                    script_path=script_path,
                )
            )
    return jobs


def render_sbatch_script(
    *,
    job: PlannedJob,
    output_dir: Path,
    results_repo_id: str,
    git_commit: str,
) -> str:
    """Render the sbatch script that trains, evaluates and publishes one job.

    The script runs up to four containers in sequence: an optional tokenizer
    stage, training, evaluation, and publishing of the result row. Each inner
    command is passed to ``bash -lc`` as a single-quoted script, so ``${...}``
    inside it is expanded in the container, not by the sbatch shell. Every
    variable an inner script reads must therefore be forwarded with ``-e``.

    Args:
        job: Planned job to render.
        output_dir: Host output directory; mounted at ``/benchmark-output``.
        results_repo_id: Hub dataset that receives the result row.
        git_commit: Commit hash recorded in the result row.

    Returns:
        The complete script text.
    """
    host_output_dir = output_dir.resolve()
    run_root = f"/benchmark-output/{job.run_rel}"
    host_run_root = host_output_dir / job.run_rel
    cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
    mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
    train_cli = format_cli_args(job.train_args)
    eval_cli = format_cli_args(job.eval_args)
    # NOTE(review): every stage uses `--gpus all` with CUDA_VISIBLE_DEVICES
    # fixed to 0..num_gpus-1, regardless of which devices SLURM granted.
    # Confirm this is intended on nodes shared between jobs.
    tokenizer_command = ""
    if job.tokenizer_args:
        tokenizer_cli = format_cli_args(job.tokenizer_args)
        tokenizer_command = f"""
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    lerobot-train-tokenizer \\
{tokenizer_cli}
  '
"""
    return f"""#!/bin/bash
#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
#SBATCH --gres=gpu:{job.num_gpus}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem_gb}G
#SBATCH --output={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.out
#SBATCH --error={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.err

set -euo pipefail

HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
RUN_ROOT="{run_root}"

mkdir -p "{host_output_dir}/logs"
mkdir -p "{host_run_root.parent}"

{tokenizer_command}

TRAIN_START="$(date +%s)"
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
{train_cli}
  '
TRAIN_END="$(date +%s)"

EVAL_START="$(date +%s)"
docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    lerobot-eval \\
{eval_cli}
  '
EVAL_END="$(date +%s)"
TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"

docker run --rm --gpus all \\
  --shm-size=16g \\
  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
  -e HF_HOME=/tmp/hf \\
  -e RUN_ROOT="${{RUN_ROOT}}" \\
  -e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
  -e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
  -e SLURM_JOB_ID="${{SLURM_JOB_ID:-}}" \\
  -v "{host_output_dir}:/benchmark-output" \\
  -w /lerobot \\
  "{job.docker_image}" \\
  bash -lc '
    set -euo pipefail
    if [[ -n "${{HF_TOKEN:-}}" ]]; then
      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
    fi
    uv run python benchmarks/publish_benchmark_result.py \\
      --benchmark={shell_value(job.benchmark)} \\
      --policy={shell_value(job.policy)} \\
      --run_root="${{RUN_ROOT}}" \\
      --results_repo={shell_value(results_repo_id)} \\
      --git_commit={shell_value(git_commit)} \\
      --num_gpus={job.num_gpus} \\
      --microbatch_per_gpu={job.microbatch_per_gpu} \\
      --gradient_accumulation_steps={job.gradient_accumulation_steps} \\
      --effective_batch_size={job.effective_batch_size} \\
      --train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
      --eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
      --slurm_job_id="${{SLURM_JOB_ID:-}}" \\
      --docker_image={shell_value(job.docker_image)}
  '
"""


def write_manifest(
    *,
    output_dir: Path,
    jobs: list[PlannedJob],
    git_commit: str,
    hub_org: str,
    results_repo: str,
) -> Path:
    """Write ``manifest.json`` describing all planned jobs and return its path."""
    manifest = {
        "generated_at": datetime.now(UTC).isoformat(),
        "git_commit": git_commit,
        "hub_org": hub_org,
        "results_repo": results_repo,
        "jobs": [asdict(job) for job in jobs],
    }
    manifest_path = output_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
    return manifest_path


def parse_args() -> argparse.Namespace:
    """Parse the command-line options selecting what to plan and where."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--policies", nargs="*", default=None)
    parser.add_argument("--benchmarks", nargs="*", default=None)
    parser.add_argument("--output_dir", required=True, type=Path)
    parser.add_argument("--hub_org", required=True)
    parser.add_argument("--results_repo", required=True)
    parser.add_argument("--submit", action="store_true")
    return parser.parse_args()


def get_git_commit() -> str:
    """Return the hash of ``HEAD`` in the current working directory's repository."""
    return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()


def main() -> int:
    """Plan the job matrix, write one sbatch script per job, optionally submit.

    Returns:
        Process exit code (always ``0``; failures surface as exceptions).
    """
    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)
    (args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
    (args.output_dir / "logs").mkdir(parents=True, exist_ok=True)

    selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
    selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
    git_commit = get_git_commit()
    results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)

    jobs = plan_jobs(
        output_dir=args.output_dir,
        hub_org=args.hub_org,
        results_repo=results_repo_id,
        policies=selected_policies,
        benchmarks=selected_benchmarks,
    )

    for job in jobs:
        script = render_sbatch_script(
            job=job,
            output_dir=args.output_dir,
            results_repo_id=results_repo_id,
            git_commit=git_commit,
        )
        script_path = Path(job.script_path)
        script_path.write_text(script)
        script_path.chmod(0o755)
        if args.submit:
            subprocess.run(["sbatch", str(script_path)], check=True)

    manifest_path = write_manifest(
        output_dir=args.output_dir,
        jobs=jobs,
        git_commit=git_commit,
        hub_org=args.hub_org,
        results_repo=results_repo_id,
    )
    print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
    print(f"Manifest: {manifest_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())


# ---------------------------------------------------------------------------
# Patch header of the next file, which shares this source line in the
# flattened diff; preserved here as comments.
#
# diff --git a/docker/Dockerfile.benchmark.libero_plus b/docker/Dockerfile.benchmark.libero_plus
# new file mode 100644
# index 000000000..4b6cbf1da
# --- /dev/null
# +++ b/docker/Dockerfile.benchmark.libero_plus
# @@ -0,0 +1,48 @@
#
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM huggingface/lerobot-gpu:latest

# System libraries are installed as root; the build then returns to the
# unprivileged user for everything else.
USER root
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot

RUN uv pip install --no-cache \
    "robosuite==1.4.1" bddl easydict mujoco matplotlib wand scikit-image gym

ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
# Only the uninstall step is best-effort. It is grouped in a subshell so that
# `|| true` cannot also swallow a failed `git clone` or `uv pip install`:
# `a && b && c || true` parses as `(a && b && c) || true`.
RUN git clone --depth=1 https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
    && (uv pip uninstall hf-libero 2>/dev/null || true)
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"

# Download the asset archive, move its `assets` directory under
# LIBERO_PLUS_ROOT, then delete the download directory.
RUN python -c "\
from huggingface_hub import hf_hub_download; \
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
    filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
    && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
    && mv /tmp/libero-plus-dl/extract/inspire/hdd/project/embodied-multimodality/public/syfei/libero_new/release/dataset/LIBERO-plus-0/assets \
        ${LIBERO_PLUS_ROOT}/assets \
    && rm -rf /tmp/libero-plus-dl

# Write ~/.libero/config.yaml with the asset, BDDL, dataset and init-state
# paths under LIBERO_PLUS_ROOT.
RUN mkdir -p /home/user_lerobot/.libero \
    && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
        > /home/user_lerobot/.libero/config.yaml

COPY --chown=user_lerobot:user_lerobot . .
+ +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robomme b/docker/Dockerfile.benchmark.robomme new file mode 100644 index 000000000..4fd3ec409 --- /dev/null +++ b/docker/Dockerfile.benchmark.robomme @@ -0,0 +1,39 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM huggingface/lerobot-gpu:latest + +ENV NVIDIA_DRIVER_CAPABILITIES=all \ + VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json + +USER root +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libvulkan1 libvulkan-dev mesa-vulkan-drivers \ + && mkdir -p /usr/share/vulkan/icd.d \ + && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \ + > /usr/share/vulkan/icd.d/nvidia_icd.json \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +USER user_lerobot + +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \ + && uv pip install --no-cache --override /tmp/robomme_override.txt \ + -e ".[smolvla,av-dep]" \ + "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \ + && python -c "import robomme; print('robomme import OK')" + +COPY --chown=user_lerobot:user_lerobot . . 
+ +CMD ["/bin/bash"] diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 5fbc1c35a..75f41d73c 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -31,10 +31,22 @@ from __future__ import annotations import argparse import json +import re import sys from pathlib import Path +# LIBERO-plus derives task.language by space-joining the perturbation-variant +# filename, so strip the perturbation metadata blob to recover the base prompt. +_LIBERO_PERTURBATION_TAIL_RE = re.compile( + r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$" +) + + +def _strip_libero_perturbation_tail(instruction: str) -> str: + return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip() + + def _libero_descriptions(task_suite: str) -> dict[str, str]: from libero.libero import benchmark # type: ignore[import-untyped] @@ -47,7 +59,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]: ) return {} suite = suite_dict[task_suite]() - return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)} + return { + f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language) + for i in range(suite.n_tasks) + } def _metaworld_descriptions(task_name: str) -> dict[str, str]: @@ -57,6 +72,14 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]: return {f"{task_name}_0": label} +def _robomme_descriptions(task_names: str) -> dict[str, str]: + return { + f"{task_name}_0": task_name.replace("_", " ").strip() + for task_name in (task.strip() for task in task_names.split(",")) + if task_name + } + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)") @@ -66,10 +89,12 @@ def main() -> int: descriptions: dict[str, str] = {} try: - if args.env == "libero": + if args.env in {"libero", "libero_plus"}: descriptions = 
_libero_descriptions(args.task) elif args.env == "metaworld": descriptions = _metaworld_descriptions(args.task) + elif args.env == "robomme": + descriptions = _robomme_descriptions(args.task) else: print( f"[extract_task_descriptions] No description extractor for env '{args.env}'.", diff --git a/spaces/benchmark-leaderboard/README.md b/spaces/benchmark-leaderboard/README.md new file mode 100644 index 000000000..a01ce3d01 --- /dev/null +++ b/spaces/benchmark-leaderboard/README.md @@ -0,0 +1,27 @@ +--- +title: LeRobot Benchmark Leaderboard +emoji: 🤖 +colorFrom: yellow +colorTo: orange +sdk: gradio +sdk_version: 5.29.0 +app_file: app.py +pinned: false +license: apache-2.0 +short_description: Benchmark history for LeRobot policy x benchmark runs +--- + +# LeRobot Benchmark Leaderboard + +This Space reads immutable benchmark rows from a Hugging Face dataset and shows: + +- Latest result per policy and benchmark +- Historical trends over time +- Direct links to uploaded eval and config artifacts + +## Configuration + +Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI +at a different public dataset. The default is: + +- `lerobot/benchmark-history` diff --git a/spaces/benchmark-leaderboard/app.py b/spaces/benchmark-leaderboard/app.py new file mode 100644 index 000000000..b5cb8271a --- /dev/null +++ b/spaces/benchmark-leaderboard/app.py @@ -0,0 +1,226 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import json +import os +import time +from pathlib import Path +from typing import Any + +import gradio as gr +import pandas as pd +import plotly.express as px +from huggingface_hub import HfApi, hf_hub_download + +RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history") +CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache") +CACHE_DIR.mkdir(parents=True, exist_ok=True) +CACHE_TTL_S = 300 + +_CACHE: dict[str, tuple[float, pd.DataFrame]] = {} + + +def _row_to_record(row: dict[str, Any]) -> dict[str, Any]: + overall = row.get("eval", {}).get("overall", {}) + resources = row.get("resources", {}) + timings = row.get("timings", {}) + artifact_urls = row.get("artifact_urls", {}) + return { + "created_at": row.get("created_at"), + "benchmark": row.get("benchmark"), + "policy": row.get("policy"), + "success_rate": overall.get("pc_success"), + "n_episodes": overall.get("n_episodes"), + "avg_sum_reward": overall.get("avg_sum_reward"), + "train_wall_time_s": timings.get("train_wall_time_s"), + "eval_wall_time_s": timings.get("eval_wall_time_s"), + "total_wall_time_s": timings.get("total_wall_time_s"), + "num_gpus": resources.get("num_gpus"), + "microbatch_per_gpu": resources.get("microbatch_per_gpu"), + "gradient_accumulation_steps": resources.get("gradient_accumulation_steps"), + "effective_batch_size": resources.get("effective_batch_size"), + "git_commit": row.get("git_commit"), + "row_url": artifact_urls.get("row"), + "eval_info_url": artifact_urls.get("eval_info"), + "train_config_url": artifact_urls.get("train_config"), + } + + +def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame: + cache_key = f"rows::{repo_id}" + cached = _CACHE.get(cache_key) + if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S: + return cached[1] + + api = HfApi() + files = [path for path in 
api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")] + records: list[dict[str, Any]] = [] + for path_in_repo in sorted(files, reverse=True): + local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR) + with open(local_path) as f: + row = json.load(f) + records.append(_row_to_record(row)) + + df = pd.DataFrame.from_records(records) + if not df.empty: + df["created_at"] = pd.to_datetime(df["created_at"], utc=True) + df = df.sort_values("created_at", ascending=False).reset_index(drop=True) + _CACHE[cache_key] = (time.monotonic(), df) + return df + + +def make_latest_table(df: pd.DataFrame) -> pd.DataFrame: + if df.empty: + return df + latest = ( + df.sort_values("created_at", ascending=False) + .groupby(["benchmark", "policy"], as_index=False) + .first() + .sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last") + ) + return latest[ + [ + "benchmark", + "policy", + "success_rate", + "n_episodes", + "train_wall_time_s", + "eval_wall_time_s", + "num_gpus", + "effective_batch_size", + "git_commit", + "row_url", + "eval_info_url", + "train_config_url", + ] + ] + + +def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any: + filtered = df[df["benchmark"] == benchmark] + if policy and policy != "All": + filtered = filtered[filtered["policy"] == policy] + if filtered.empty: + return px.line(title="No benchmark rows found") + fig = px.line( + filtered.sort_values("created_at"), + x="created_at", + y="success_rate", + color="policy", + markers=True, + hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"], + title=f"{benchmark} success rate history", + ) + fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time") + return fig + + +def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str: + filtered = df[df["benchmark"] == benchmark] + if policy and policy != "All": + 
filtered = filtered[filtered["policy"] == policy] + if filtered.empty: + return "No matching runs yet." + latest = filtered.sort_values("created_at", ascending=False).iloc[0] + row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None + eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None + train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None + lines = [ + f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`", + f"Success rate: `{latest['success_rate']}`", + f"GPUs: `{latest['num_gpus']}`", + f"Effective batch size: `{latest['effective_batch_size']}`", + f"Commit: `{latest['git_commit']}`", + ] + if row_link: + lines.append(f"Row JSON: [open]({row_link})") + if eval_link: + lines.append(f"Eval Info: [open]({eval_link})") + if train_link: + lines.append(f"Train Config: [open]({train_link})") + return "\n\n".join(lines) + + +def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]: + df = load_rows() + latest_table = make_latest_table(df) + benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else [] + if benchmark not in benchmark_names and benchmark_names: + benchmark = benchmark_names[0] + policy_choices = ["All"] + if benchmark and not df.empty: + policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist())) + if policy not in policy_choices: + policy = "All" + history = make_history_figure(df, benchmark, policy) + summary = make_run_markdown(df, benchmark, policy) + return latest_table, gr.update(choices=policy_choices, value=policy), history, summary + + +with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo: + gr.Markdown( + f""" +# LeRobot Benchmark Leaderboard + +Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO}) +""" + ) + + with gr.Row(): + benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[]) + policy_dropdown = 
gr.Dropdown(label="Policy", choices=["All"], value="All") + refresh_button = gr.Button("Refresh") + + latest_table = gr.Dataframe(label="Latest Results", interactive=False) + history_plot = gr.Plot(label="History") + latest_summary = gr.Markdown() + + def _initial_state(): + df = load_rows() + benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else [] + benchmark = benchmarks[0] if benchmarks else "" + latest, policy_choices, history, summary = refresh_view(benchmark, "All") + return ( + gr.update(choices=benchmarks, value=benchmark), + policy_choices, + latest, + history, + summary, + ) + + demo.load( + _initial_state, + outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary], + ) + refresh_button.click( + refresh_view, + inputs=[benchmark_dropdown, policy_dropdown], + outputs=[latest_table, policy_dropdown, history_plot, latest_summary], + ) + benchmark_dropdown.change( + refresh_view, + inputs=[benchmark_dropdown, policy_dropdown], + outputs=[latest_table, policy_dropdown, history_plot, latest_summary], + ) + policy_dropdown.change( + refresh_view, + inputs=[benchmark_dropdown, policy_dropdown], + outputs=[latest_table, policy_dropdown, history_plot, latest_summary], + ) + + +if __name__ == "__main__": + demo.launch() diff --git a/spaces/benchmark-leaderboard/requirements.txt b/spaces/benchmark-leaderboard/requirements.txt new file mode 100644 index 000000000..7f528531a --- /dev/null +++ b/spaces/benchmark-leaderboard/requirements.txt @@ -0,0 +1,4 @@ +gradio>=5.0.0,<6.0.0 +plotly>=5.18.0 +pandas>=2.0.0 +huggingface-hub>=1.0.0,<2.0.0 diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index b05e96fde..6e4368206 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -67,11 +67,17 @@ class EvalConfig: # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv. 
# Set to 0 for auto-tuning based on available CPU cores and n_episodes. batch_size: int = 0 + # Number of rollout videos to save per evaluated task. Set to 0 to disable videos. + max_episodes_rendered: int = 10 # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing). # Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1. use_async_envs: bool = True def __post_init__(self) -> None: + if self.max_episodes_rendered < 0: + raise ValueError( + f"`max_episodes_rendered` must be non-negative, got {self.max_episodes_rendered}." + ) if self.batch_size == 0: self.batch_size = self._auto_batch_size() if self.batch_size > self.n_episodes: diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index d754a0847..16e8e381e 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -56,6 +56,7 @@ class TrainPipelineConfig(HubMixin): # Number of workers for the dataloader. num_workers: int = 4 batch_size: int = 8 + gradient_accumulation_steps: int = 1 steps: int = 100_000 eval_freq: int = 20_000 log_freq: int = 200 @@ -132,6 +133,11 @@ class TrainPipelineConfig(HubMixin): if isinstance(self.dataset.repo_id, list): raise NotImplementedError("LeRobotMultiDataset is not currently implemented.") + if self.gradient_accumulation_steps <= 0: + raise ValueError( + f"`gradient_accumulation_steps` must be strictly positive, got {self.gradient_accumulation_steps}." 
+ ) + if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None): raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.") elif self.use_policy_training_preset and not self.resume: diff --git a/src/lerobot/envs/__init__.py b/src/lerobot/envs/__init__.py index 277fd04f4..459ebdc1b 100644 --- a/src/lerobot/envs/__init__.py +++ b/src/lerobot/envs/__init__.py @@ -18,7 +18,15 @@ # from lerobot.utils.import_utils import require_package # require_package("gymnasium", extra="", import_name="gymnasium") -from .configs import AlohaEnv, EnvConfig, HILSerlRobotEnvConfig, HubEnvConfig, PushtEnv +from .configs import ( + AlohaEnv, + EnvConfig, + HILSerlRobotEnvConfig, + HubEnvConfig, + LiberoPlusEnv, + PushtEnv, + RoboMMEEnv, +) from .factory import make_env, make_env_config, make_env_pre_post_processors from .utils import check_env_attributes_and_types, close_envs, env_to_policy_features, preprocess_observation @@ -27,7 +35,9 @@ __all__ = [ "EnvConfig", "HILSerlRobotEnvConfig", "HubEnvConfig", + "LiberoPlusEnv", "PushtEnv", + "RoboMMEEnv", "check_env_attributes_and_types", "close_envs", "env_to_policy_features", diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 2a7c52d45..a5a38279e 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -574,3 +574,58 @@ class IsaaclabArenaEnv(HubEnvConfig): ), PolicyProcessorPipeline(steps=[]), ) + + +@EnvConfig.register_subclass("libero_plus") +@dataclass +class LiberoPlusEnv(LiberoEnv): + """Config for LIBERO-plus robustness benchmark evaluation.""" + + task: str = "libero_spatial" + + +@EnvConfig.register_subclass("robomme") +@dataclass +class RoboMMEEnv(EnvConfig): + """RoboMME memory-augmented manipulation benchmark.""" + + task: str = "PickXtimes" + fps: int = 10 + episode_length: int = 300 + action_space: str = "joint_angle" + dataset_split: str = "test" + task_ids: list[int] | None = None + features: dict[str, 
PolicyFeature] = field( + default_factory=lambda: { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)), + "image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + "wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)), + } + ) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "image": f"{OBS_IMAGES}.image", + "wrist_image": f"{OBS_IMAGES}.wrist_image", + OBS_STATE: OBS_STATE, + } + ) + + @property + def gym_kwargs(self) -> dict: + return {} + + def create_envs(self, n_envs: int, use_async_envs: bool = True): + from .robomme import create_robomme_envs + + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robomme_envs( + task=self.task, + n_envs=n_envs, + action_space_type=self.action_space, + dataset=self.dataset_split, + episode_length=self.episode_length, + task_ids=self.task_ids, + env_cls=env_cls, + ) diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index ec90d0ffd..f67be5b91 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -16,6 +16,7 @@ from __future__ import annotations import os +import re from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial @@ -69,14 +70,28 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i return ids +# LIBERO-plus perturbation variants encode the perturbation in the filename +# but on disk only the base `.pruned_init` exists — strip the suffix to match +# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we +# can pass weights_only=False for PyTorch 2.6+ numpy pickles). 
+_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+") + + def get_task_init_states(task_suite: Any, i: int) -> np.ndarray: - init_states_path = ( - Path(get_libero_path("init_states")) - / task_suite.tasks[i].problem_folder - / task_suite.tasks[i].init_states_file - ) - init_states = torch.load(init_states_path, weights_only=False) # nosec B614 - return init_states + task = task_suite.tasks[i] + filename = Path(task.init_states_file) + root = Path(get_libero_path("init_states")) + + # `_add_` / `_level` variants store extra-object layouts under libero_newobj/ + # as a flat array that must be reshaped to (1, -1). + if "_add_" in filename.name or "_level" in filename.name: + init_states_path = root / "libero_newobj" / task.problem_folder / filename.name + init_states = torch.load(init_states_path, weights_only=False) # nosec B614 + return init_states.reshape(1, -1) + + stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix + init_states_path = root / task.problem_folder / stripped + return torch.load(init_states_path, weights_only=False) # nosec B614 def get_libero_dummy_action(): diff --git a/src/lerobot/envs/robomme.py b/src/lerobot/envs/robomme.py new file mode 100644 index 000000000..8babb4c30 --- /dev/null +++ b/src/lerobot/envs/robomme.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""RoboMME environment wrapper for LeRobot evaluation.""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +from gymnasium import spaces + +ROBOMME_TASKS = [ + "BinFill", + "PickXtimes", + "SwingXtimes", + "StopCube", + "VideoUnmask", + "VideoUnmaskSwap", + "ButtonUnmask", + "ButtonUnmaskSwap", + "PickHighlight", + "VideoRepick", + "VideoPlaceButton", + "VideoPlaceOrder", + "MoveCube", + "InsertPeg", + "PatternLock", + "RouteStick", +] + + +class RoboMMEGymEnv(gym.Env): + """Thin Gymnasium wrapper around a single RoboMME episode env.""" + + metadata = {"render_modes": ["rgb_array"], "render_fps": 10} + + def __init__( + self, + task: str = "PickXtimes", + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_idx: int = 0, + max_steps: int = 300, + ): + super().__init__() + from robomme.env_record_wrapper import BenchmarkEnvBuilder + + self._builder = BenchmarkEnvBuilder( + env_id=task, + dataset=dataset, + action_space=action_space_type, + gui_render=False, + max_steps=max_steps, + ) + self._max_episode_steps = max_steps + self._episode_idx = episode_idx + self._max_steps = max_steps + self._env = None + self._last_raw_obs: dict | None = None + + action_dim = 8 if action_space_type == "joint_angle" else 7 + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32) + self.observation_space = spaces.Dict( + { + "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + "state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32), + } + ) + + def reset(self, *, seed=None, options=None): + super().reset(seed=seed) + self._env = self._builder.make_env_for_episode( + episode_idx=self._episode_idx, + max_steps=self._max_steps, + ) + obs, info = self._env.reset() + self._last_raw_obs = obs + 
return self._convert_obs(obs), self._convert_info(info) + + def step(self, action): + obs, reward, terminated, truncated, info = self._env.step(action) + self._last_raw_obs = obs + + terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated) + truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated) + + status = info.get("status", "ongoing") + conv_info = self._convert_info(info) + conv_info["is_success"] = status == "success" + + return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info + + def render(self) -> np.ndarray | None: + if self._last_raw_obs is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + front = self._last_raw_obs.get("front_rgb_list") + if front is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + frame = front[-1] if isinstance(front, list) else front + return np.asarray(frame, dtype=np.uint8) + + def _convert_obs(self, obs: dict) -> dict: + front_rgb = ( + obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"] + ) + wrist_rgb = ( + obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"] + ) + joint_state = ( + obs["joint_state_list"][-1] + if isinstance(obs["joint_state_list"], list) + else obs["joint_state_list"] + ) + gripper_state = ( + obs["gripper_state_list"][-1] + if isinstance(obs["gripper_state_list"], list) + else obs["gripper_state_list"] + ) + + joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7] + gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1] + state = np.concatenate([joint, gripper]) + + return { + "image": np.asarray(front_rgb, dtype=np.uint8), + "wrist_image": np.asarray(wrist_rgb, dtype=np.uint8), + "state": state, + } + + def _convert_info(self, info: dict) -> dict: + return { + "status": info.get("status", "ongoing"), + "task_goal": info.get("task_goal", ""), + } + + +def _make_env_fns( + *, + task: 
str, + n_envs: int, + action_space_type: str, + dataset: str, + episode_length: int, + task_id: int, +) -> list[Callable[[], RoboMMEGymEnv]]: + def _make_one(episode_index: int) -> RoboMMEGymEnv: + return RoboMMEGymEnv( + task=task, + action_space_type=action_space_type, + dataset=dataset, + episode_idx=episode_index, + max_steps=episode_length, + ) + + return [partial(_make_one, task_id + i) for i in range(n_envs)] + + +def create_robomme_envs( + task: str, + n_envs: int = 1, + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_length: int = 300, + task_ids: list[int] | None = None, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, +) -> dict[str, dict[int, gym.vector.VectorEnv]]: + """Create vectorized RoboMME environments for evaluation.""" + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of env factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + if task_ids is None: + task_ids = [0] + + task_names = [t.strip() for t in task.split(",") if t.strip()] + out: dict[str, dict[int, gym.vector.VectorEnv]] = {} + for task_name in task_names: + envs_by_task: dict[int, gym.vector.VectorEnv] = {} + for task_id in task_ids: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + action_space_type=action_space_type, + dataset=dataset, + episode_length=episode_length, + task_id=task_id, + ) + envs_by_task[task_id] = env_cls(fns) + out[task_name] = envs_by_task + return out diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index d45483d21..3378a8cbd 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -572,7 +572,7 @@ def eval_main(cfg: EvalPipelineConfig): preprocessor=preprocessor, postprocessor=postprocessor, n_episodes=cfg.eval.n_episodes, - max_episodes_rendered=10, + 
max_episodes_rendered=cfg.eval.max_episodes_rendered, videos_dir=Path(cfg.output_dir) / "videos", start_seed=cfg.seed, max_parallel_tasks=cfg.env.max_parallel_tasks, diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index a862c640d..d31e45233 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -71,6 +71,9 @@ def update_policy( lr_scheduler=None, lock=None, rabc_weights_provider=None, + *, + do_optimizer_step: bool = True, + loss_divisor: int = 1, ) -> tuple[MetricsTracker, dict]: """ Performs a single training step to update the policy's weights. @@ -122,34 +125,38 @@ def update_policy( loss, output_dict = policy.forward(batch) # TODO(rcadene): policy.unnormalize_outputs(out_dict) + logged_loss = loss.detach() + if loss_divisor > 1: + loss = loss / loss_divisor # Use accelerator's backward method accelerator.backward(loss) - # Clip gradients if specified - if grad_clip_norm > 0: - grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) - else: - grad_norm = torch.nn.utils.clip_grad_norm_( - policy.parameters(), float("inf"), error_if_nonfinite=False - ) + grad_norm_value = 0.0 + if do_optimizer_step: + if grad_clip_norm > 0: + grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) + else: + grad_norm = torch.nn.utils.clip_grad_norm_( + policy.parameters(), float("inf"), error_if_nonfinite=False + ) + grad_norm_value = grad_norm.item() - # Optimizer step - with lock if lock is not None else nullcontext(): - optimizer.step() + with lock if lock is not None else nullcontext(): + optimizer.step() - optimizer.zero_grad() + optimizer.zero_grad() - # Step through pytorch scheduler at every batch instead of epoch - if lr_scheduler is not None: - lr_scheduler.step() + # Step through pytorch scheduler at every optimizer step instead of epoch + if lr_scheduler is not None: + lr_scheduler.step() - # Update internal buffers if policy has update method - 
if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"): - accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update() + # Update internal buffers if policy has update method + if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"): + accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update() - train_metrics.loss = loss.item() - train_metrics.grad_norm = grad_norm.item() + train_metrics.loss = logged_loss.item() + train_metrics.grad_norm = grad_norm_value train_metrics.lr = optimizer.param_groups[0]["lr"] train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict @@ -359,8 +366,16 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") logging.info(f"{dataset.num_episodes=}") num_processes = accelerator.num_processes - effective_bs = cfg.batch_size * num_processes - logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}") + micro_batch = cfg.batch_size + logical_batch = cfg.batch_size * cfg.gradient_accumulation_steps + effective_bs = logical_batch * num_processes + logging.info( + "Effective batch size: %s x %s x %s = %s", + micro_batch, + cfg.gradient_accumulation_steps, + num_processes, + effective_bs, + ) logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") @@ -407,9 +422,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): } # Keep global batch size for logging; MetricsTracker handles world size internally. 
- effective_batch_size = cfg.batch_size * accelerator.num_processes + logical_batch_size = cfg.batch_size * cfg.gradient_accumulation_steps + effective_batch_size = logical_batch_size * accelerator.num_processes train_tracker = MetricsTracker( - cfg.batch_size, + logical_batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, @@ -431,21 +447,62 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): ) for _ in range(step, cfg.steps): - start_time = time.perf_counter() - batch = next(dl_iter) - batch = preprocessor(batch) - train_tracker.dataloading_s = time.perf_counter() - start_time + step_dataloading_s = 0.0 + step_update_s = 0.0 + step_losses = [] + step_grad_norm = 0.0 + step_lr = optimizer.param_groups[0]["lr"] + output_dict = {} + optimizer.zero_grad() + for accumulation_idx in range(cfg.gradient_accumulation_steps): + start_time = time.perf_counter() + batch = next(dl_iter) + batch = preprocessor(batch) + step_dataloading_s += time.perf_counter() - start_time - train_tracker, output_dict = update_policy( - train_tracker, - policy, - batch, - optimizer, - cfg.optimizer.grad_clip_norm, - accelerator=accelerator, - lr_scheduler=lr_scheduler, - rabc_weights_provider=rabc_weights, - ) + is_last_microbatch = accumulation_idx == cfg.gradient_accumulation_steps - 1 + micro_metrics = MetricsTracker( + cfg.batch_size, + dataset.num_frames, + dataset.num_episodes, + { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + "lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + }, + accelerator=accelerator, + ) + sync_context = ( + nullcontext() + if is_last_microbatch or accelerator.num_processes == 1 + else accelerator.no_sync(policy) + ) + with sync_context: + micro_metrics, micro_output_dict = update_policy( + micro_metrics, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + accelerator=accelerator, + lr_scheduler=lr_scheduler if is_last_microbatch else 
None, + rabc_weights_provider=rabc_weights, + do_optimizer_step=is_last_microbatch, + loss_divisor=cfg.gradient_accumulation_steps, + ) + step_update_s += micro_metrics.update_s.val + step_losses.append(micro_metrics.loss.val) + if is_last_microbatch: + step_grad_norm = micro_metrics.grad_norm.val + step_lr = micro_metrics.lr.val + output_dict = micro_output_dict + + train_tracker.loss = sum(step_losses) / len(step_losses) + train_tracker.grad_norm = step_grad_norm + train_tracker.lr = step_lr + train_tracker.update_s = step_update_s + train_tracker.dataloading_s = step_dataloading_s # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we # increment `step` here. @@ -510,7 +567,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): postprocessor=postprocessor, n_episodes=cfg.eval.n_episodes, videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, + max_episodes_rendered=cfg.eval.max_episodes_rendered, start_seed=cfg.seed, max_parallel_tasks=cfg.env.max_parallel_tasks, ) @@ -541,7 +598,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): if wandb_logger: wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} wandb_logger.log_dict(wandb_log_dict, step, mode="eval") - wandb_logger.log_video(eval_info["overall"]["video_paths"][0], step, mode="eval") + video_paths = eval_info["overall"].get("video_paths", []) + if video_paths: + wandb_logger.log_video(video_paths[0], step, mode="eval") accelerator.wait_for_everyone() diff --git a/src/lerobot/utils/history_repo.py b/src/lerobot/utils/history_repo.py new file mode 100644 index 000000000..8c8f82106 --- /dev/null +++ b/src/lerobot/utils/history_repo.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from huggingface_hub import HfApi + + +def utc_timestamp_slug(now: datetime | None = None) -> str: + current = now or datetime.now(UTC) + return current.strftime("%Y%m%dT%H%M%SZ") + + +def make_hub_file_url(repo_id: str, path_in_repo: str, repo_type: str = "dataset") -> str: + prefix = "datasets/" if repo_type == "dataset" else "" + return f"https://huggingface.co/{prefix}{repo_id}/resolve/main/{path_in_repo}" + + +def write_json(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True)) + + +@dataclass(frozen=True) +class UploadTarget: + local_path: Path + path_in_repo: str + + +def upload_targets( + repo_id: str, + targets: list[UploadTarget], + *, + repo_type: str = "dataset", + token: str | None = None, + private: bool | None = None, + commit_message: str | None = None, +) -> dict[str, str]: + api = HfApi(token=token) + api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True) + uploaded: dict[str, str] = {} + for target in targets: + api.upload_file( + path_or_fileobj=str(target.local_path), + path_in_repo=target.path_in_repo, + repo_id=repo_id, + repo_type=repo_type, + commit_message=commit_message or f"Upload 
{target.path_in_repo}", + ) + uploaded[target.path_in_repo] = make_hub_file_url(repo_id, target.path_in_repo, repo_type=repo_type) + return uploaded diff --git a/tests/benchmarks/test_benchmark_matrix.py b/tests/benchmarks/test_benchmark_matrix.py new file mode 100644 index 000000000..69e5e1e46 --- /dev/null +++ b/tests/benchmarks/test_benchmark_matrix.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +from benchmarks.run_benchmark_matrix import ( + PlannedJob, + compute_gradient_accumulation_steps, + plan_jobs, + render_sbatch_script, + write_manifest, +) + + +def _one_job(job_list: list[PlannedJob]) -> PlannedJob: + assert len(job_list) == 1 + return job_list[0] + + +def test_compute_gradient_accumulation_steps_for_fixed_effective_batch(): + assert compute_gradient_accumulation_steps( + effective_batch_size=256, + num_gpus=8, + microbatch_per_gpu=32, + ) == 1 + assert compute_gradient_accumulation_steps( + effective_batch_size=256, + num_gpus=4, + microbatch_per_gpu=32, + ) == 2 + assert compute_gradient_accumulation_steps( + effective_batch_size=256, + num_gpus=1, + microbatch_per_gpu=32, + ) == 8 + + +def test_plan_jobs_filters_libero_plus_only(tmp_path): + jobs = plan_jobs( + output_dir=tmp_path, + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + policies=["pi0", "act"], + benchmarks=["libero_plus"], + ) + + assert [job.benchmark for job in jobs] == ["libero_plus", "libero_plus"] + assert [job.policy for job in jobs] == ["pi0", "act"] + + +def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path): + jobs = plan_jobs( + output_dir=tmp_path, + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + policies=["pi0"], + benchmarks=["libero_plus", "robomme"], + ) + + assert [job.benchmark for job in jobs] == ["libero_plus", "robomme"] + assert jobs[0].effective_batch_size == 256 + assert jobs[1].effective_batch_size == 256 + + +def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path): + jobs = plan_jobs( + output_dir=tmp_path, + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + policies=["pi0", "xvla", "act"], + benchmarks=["robomme"], + ) + by_policy = {job.policy: job for job in jobs} + + assert by_policy["pi0"].num_gpus == 8 + assert by_policy["pi0"].gradient_accumulation_steps == 1 + assert by_policy["xvla"].num_gpus == 4 + assert by_policy["xvla"].gradient_accumulation_steps == 2 + assert 
by_policy["act"].num_gpus == 1 + assert by_policy["act"].gradient_accumulation_steps == 8 + + +def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path): + job = _one_job( + plan_jobs( + output_dir=tmp_path, + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + policies=["pi0_fast"], + benchmarks=["robomme"], + ) + ) + + script = render_sbatch_script( + job=job, + output_dir=tmp_path, + results_repo_id="lerobot/benchmark-history", + git_commit="deadbeef", + ) + + assert "docker/Dockerfile" not in script + assert "lerobot-benchmark-robomme:latest" in script + assert '--dataset.repo_id="lerobot/robomme"' in script + assert '--env.type="robomme"' in script + assert "--gradient_accumulation_steps=1" in script + assert "lerobot-train-tokenizer" in script + assert "benchmarks/publish_benchmark_result.py" in script + + +def test_write_manifest_records_job_metadata(tmp_path): + jobs = plan_jobs( + output_dir=tmp_path, + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + policies=["pi0"], + benchmarks=["libero_plus", "robomme"], + ) + manifest_path = write_manifest( + output_dir=tmp_path, + jobs=jobs, + git_commit="deadbeef", + hub_org="lerobot", + results_repo="lerobot/benchmark-history", + ) + + manifest = json.loads(manifest_path.read_text()) + assert manifest["git_commit"] == "deadbeef" + assert manifest["results_repo"] == "lerobot/benchmark-history" + assert [job["benchmark"] for job in manifest["jobs"]] == ["libero_plus", "robomme"] diff --git a/tests/envs/test_robomme_env.py b/tests/envs/test_robomme_env.py new file mode 100644 index 000000000..452c2f101 --- /dev/null +++ b/tests/envs/test_robomme_env.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import sys +from types import ModuleType +from unittest.mock import MagicMock + +import numpy as np + + +def _install_robomme_stub(): + stub = ModuleType("robomme") + wrapper_stub = ModuleType("robomme.env_record_wrapper") + + class FakeBuilder: + def __init__(self, **kwargs): + pass + + def make_env_for_episode(self, episode_idx: int, max_steps: int): + env = MagicMock() + obs = { + "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "joint_state_list": [np.zeros(7, dtype=np.float32)], + "gripper_state_list": [np.zeros(2, dtype=np.float32)], + } + env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"}) + env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""}) + return env + + wrapper_stub.BenchmarkEnvBuilder = FakeBuilder + stub.env_record_wrapper = wrapper_stub + sys.modules["robomme"] = stub + sys.modules["robomme.env_record_wrapper"] = wrapper_stub + + +def _uninstall_robomme_stub(): + sys.modules.pop("robomme", None) + sys.modules.pop("robomme.env_record_wrapper", None) + + +def test_robomme_env_config_defaults(): + from lerobot.envs.configs import RoboMMEEnv + + cfg = RoboMMEEnv() + assert cfg.task == "PickXtimes" + assert cfg.fps == 10 + assert cfg.episode_length == 300 + assert cfg.action_space == "joint_angle" + assert cfg.dataset_split == "test" + assert cfg.task_ids is None + + +def test_robomme_features_map(): + from lerobot.envs.configs import RoboMMEEnv + from 
lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE + + cfg = RoboMMEEnv() + assert cfg.features_map[ACTION] == ACTION + assert cfg.features_map["image"] == f"{OBS_IMAGES}.image" + assert cfg.features_map["wrist_image"] == f"{OBS_IMAGES}.wrist_image" + assert cfg.features_map[OBS_STATE] == OBS_STATE + + +def test_convert_obs_list_format(): + _install_robomme_stub() + try: + from lerobot.envs.robomme import RoboMMEGymEnv + + env = RoboMMEGymEnv.__new__(RoboMMEGymEnv) + + front = np.full((256, 256, 3), 42, dtype=np.uint8) + wrist = np.full((256, 256, 3), 7, dtype=np.uint8) + joints = np.arange(7, dtype=np.float32) + gripper = np.array([0.5, 0.5], dtype=np.float32) + + obs_raw = { + "front_rgb_list": [np.zeros_like(front), front], + "wrist_rgb_list": [np.zeros_like(wrist), wrist], + "joint_state_list": [np.zeros(7, dtype=np.float32), joints], + "gripper_state_list": [np.zeros(2, dtype=np.float32), gripper], + } + + result = env._convert_obs(obs_raw) + np.testing.assert_array_equal(result["image"], front) + np.testing.assert_array_equal(result["wrist_image"], wrist) + assert result["state"].shape == (8,) + np.testing.assert_array_almost_equal(result["state"][:7], joints) + assert result["state"][7] == gripper[0] + finally: + _uninstall_robomme_stub() + + +def test_create_robomme_envs_multi_task(): + _install_robomme_stub() + try: + from lerobot.envs.robomme import create_robomme_envs + + env_cls = MagicMock(return_value=MagicMock()) + result = create_robomme_envs( + task="PickXtimes,BinFill,StopCube", + n_envs=1, + env_cls=env_cls, + ) + + assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"} + finally: + _uninstall_robomme_stub()