# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Integration tests: build one Docker image per benchmark and run a
# 1-episode smoke eval inside it. Each benchmark has its own Dockerfile
# (docker/Dockerfile.benchmark.<benchmark>) so benchmark-specific setup stays
# separate.
# NOTE: the current Dockerfiles extend huggingface/lerobot-gpu:latest, which
# (per their own header comments) already has all extras installed, so the
# images are not dependency-isolated from each other.
#
# To add a new benchmark:
#   1. Add docker/Dockerfile.benchmark.<benchmark>.
#   2. Copy one of the jobs below and adjust the image name and eval command.
name: Benchmark Integration Tests

on:
  # Run manually from the Actions tab
  workflow_dispatch:

  # Run every Monday at 02:00 UTC.
  schedule:
    - cron: "0 2 * * 1"

  push:
    branches:
      - main
    paths:
      - "src/lerobot/envs/**"
      - "src/lerobot/scripts/lerobot_eval.py"
      - "docker/Dockerfile.benchmark.*"
      - ".github/workflows/benchmark_tests.yml"
      - "pyproject.toml"

  pull_request:
    branches:
      - main
    paths:
      - "src/lerobot/envs/**"
      - "src/lerobot/scripts/lerobot_eval.py"
      - "docker/Dockerfile.benchmark.*"
      - ".github/workflows/benchmark_tests.yml"
      - "pyproject.toml"

permissions:
  contents: read

env:
  UV_VERSION: "0.8.0"
  PYTHON_VERSION: "3.12"

# Cancel in-flight runs for the same branch/PR.
# github.head_ref is only set for pull_request events; other events fall back
# to the unique run id, so they never cancel each other.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # ── LIBERO ────────────────────────────────────────────────────────────────
  # Image: docker/Dockerfile.benchmark.libero
  # (nightly GPU image + pre-downloaded LIBERO assets + this checkout's source)
  libero-integration-test:
    name: Libero — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          lfs: true

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          cache-binary: false

      - name: Login to Docker Hub
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}

      # Build the benchmark-specific image. The Dockerfile runs its slow setup
      # (LIBERO asset download) before copying the source, so code-only changes
      # can reuse that layer when the runner has a warm Docker daemon cache.
      - name: Build Libero benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
          context: .
          file: docker/Dockerfile.benchmark.libero
          push: false
          load: true
          tags: lerobot-benchmark-libero:ci

      - name: Run Libero smoke eval (1 episode)
        if: env.HF_USER_TOKEN != ''
        run: |
          # Named container (no --rm) so we can docker cp artifacts out.
          # Output to /tmp inside the container — /artifacts doesn't exist
          # and user_lerobot cannot create root-level dirs.
          #
          # The inner script records the exit status of lerobot-eval (and of
          # the description extractor) in eval_rc and exits with it. Without
          # this, the container would exit with the status of the last command
          # only, and a failed eval would leave this step green.
          docker run --name libero-eval --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
            lerobot-benchmark-libero:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              eval_rc=0
              lerobot-eval \
                --policy.path=pepijn223/smolvla_libero \
                --env.type=libero \
                --env.task=libero_spatial \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
                --policy.device=cuda \
                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
                --policy.empty_cameras=1 \
                --output_dir=/tmp/eval-artifacts || eval_rc=\$?
              python scripts/ci/extract_task_descriptions.py \
                --env libero --task libero_spatial \
                --output /tmp/eval-artifacts/task_descriptions.json || eval_rc=\$?
              exit \$eval_rc
            "

      - name: Copy Libero artifacts from container
        if: always()
        run: |
          mkdir -p /tmp/libero-artifacts
          docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
          docker rm -f libero-eval || true

      - name: Parse Libero eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/libero-artifacts \
            --env libero \
            --task libero_spatial \
            --policy pepijn223/smolvla_libero

      - name: Upload Libero rollout video
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: libero-rollout-video
          path: /tmp/libero-artifacts/videos/
          if-no-files-found: warn

      - name: Upload Libero eval metrics
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: libero-metrics
          path: /tmp/libero-artifacts/metrics.json
          if-no-files-found: warn

      # ── LIBERO TRAIN+EVAL SMOKE ──────────────────────────────────────────────
      # Train SmolVLA for 1 step (batch_size=1, dataset episode 0 only), then
      # immediately run eval inside the training loop (eval_freq=1, 1 episode).
      # Tests the full train→eval-within-training pipeline end-to-end.
      # lerobot-train is the last command in the inner script, so its exit
      # status is the container's exit status.
      - name: Run Libero train+eval smoke (1 step, eval_freq=1)
        if: env.HF_USER_TOKEN != ''
        run: |
          docker run --name libero-train-smoke --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
            lerobot-benchmark-libero:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              accelerate launch --num_processes=1 \$(which lerobot-train) \
                --policy.path=lerobot/smolvla_base \
                --policy.load_vlm_weights=true \
                --policy.scheduler_decay_steps=25000 \
                --policy.freeze_vision_encoder=false \
                --policy.train_expert_only=false \
                --dataset.repo_id=lerobot/libero \
                --dataset.episodes=[0] \
                --dataset.use_imagenet_stats=false \
                --env.type=libero \
                --env.task=libero_spatial \
                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
                --policy.empty_cameras=1 \
                --output_dir=/tmp/train-smoke \
                --steps=1 \
                --batch_size=1 \
                --eval_freq=1 \
                --eval.n_episodes=1 \
                --eval.batch_size=1 \
                --eval.use_async_envs=false \
                --save_freq=1 \
                --policy.push_to_hub=false \
                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.image2\": \"observation.images.camera2\"}'
            "

      - name: Copy Libero train-smoke artifacts from container
        if: always()
        run: |
          mkdir -p /tmp/libero-train-smoke-artifacts
          docker cp libero-train-smoke:/tmp/train-smoke/. /tmp/libero-train-smoke-artifacts/ 2>/dev/null || true
          docker rm -f libero-train-smoke || true

      - name: Upload Libero train-smoke eval video
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: libero-train-smoke-video
          path: /tmp/libero-train-smoke-artifacts/eval/
          if-no-files-found: warn

  # ── METAWORLD ─────────────────────────────────────────────────────────────
  # Image: docker/Dockerfile.benchmark.metaworld
  # (nightly GPU image + this checkout's source)
  metaworld-integration-test:
    name: MetaWorld — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          lfs: true

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          cache-binary: false

      - name: Login to Docker Hub
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}

      - name: Build MetaWorld benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
          context: .
          file: docker/Dockerfile.benchmark.metaworld
          push: false
          load: true
          tags: lerobot-benchmark-metaworld:ci

      - name: Run MetaWorld smoke eval (1 episode)
        if: env.HF_USER_TOKEN != ''
        run: |
          # Named container (no --rm) so the next step can docker cp artifacts
          # out. The inner script exits with eval_rc so that a failed
          # lerobot-eval fails this step even though the description extractor
          # runs afterwards and normally succeeds.
          docker run --name metaworld-eval --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
            lerobot-benchmark-metaworld:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              eval_rc=0
              lerobot-eval \
                --policy.path=pepijn223/smolvla_metaworld \
                --env.type=metaworld \
                --env.task=metaworld-push-v3 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
                --policy.device=cuda \
                '--rename_map={\"observation.image\": \"observation.images.camera1\"}' \
                --policy.empty_cameras=2 \
                --output_dir=/tmp/eval-artifacts || eval_rc=\$?
              python scripts/ci/extract_task_descriptions.py \
                --env metaworld --task metaworld-push-v3 \
                --output /tmp/eval-artifacts/task_descriptions.json || eval_rc=\$?
              exit \$eval_rc
            "

      - name: Copy MetaWorld artifacts from container
        if: always()
        run: |
          mkdir -p /tmp/metaworld-artifacts
          docker cp metaworld-eval:/tmp/eval-artifacts/. /tmp/metaworld-artifacts/ 2>/dev/null || true
          docker rm -f metaworld-eval || true

      - name: Parse MetaWorld eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/metaworld-artifacts \
            --env metaworld \
            --task metaworld-push-v3 \
            --policy pepijn223/smolvla_metaworld

      - name: Upload MetaWorld rollout video
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: metaworld-rollout-video
          path: /tmp/metaworld-artifacts/videos/
          if-no-files-found: warn

      - name: Upload MetaWorld eval metrics
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: metaworld-metrics
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn
# Base image for the LIBERO benchmark image. The ":latest" tag is floating, so
# the exact base contents depend on when the image is built.
FROM huggingface/lerobot-gpu:latest

# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
# runtime (which times out on CI). Point the libero config at the cached path.
# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
# so we write the config before any libero import can happen.
#
# The single RUN below does four things in order:
#   1. Locates the installed `libero` package with importlib.util.find_spec
#      (this does not import it) and stores <package dir>/libero in LIBERO_DIR.
#   2. Creates /home/user_lerobot/.libero.
#   3. Downloads the `lerobot/libero-assets` dataset repo into
#      /home/user_lerobot/.libero/assets via huggingface_hub.snapshot_download.
#   4. Writes /home/user_lerobot/.libero/config.yaml with the keys `assets`,
#      `bddl_files`, `datasets` and `init_states`; all but `assets` are paths
#      derived from LIBERO_DIR.
# NOTE(review): this layer needs network access to the HF Hub at build time.
RUN LIBERO_DIR=$(python -c \
    "import importlib.util, os; s=importlib.util.find_spec('libero'); \
    print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
    mkdir -p /home/user_lerobot/.libero && \
    python -c "\
from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
    local_dir='/home/user_lerobot/.libero/assets')" && \
    printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
    > /home/user_lerobot/.libero/config.yaml

# Overlay the PR's source code on top of the nightly image.
# The build context is copied into the working directory inherited from the
# base image (no WORKDIR is set here) and chown'ed to user_lerobot. This COPY
# comes after the asset download so that source-only changes do not invalidate
# the download layer in the Docker build cache.
COPY --chown=user_lerobot:user_lerobot . .

# Default to an interactive shell; CI overrides this with an explicit command.
CMD ["/bin/bash"]
# Extends the nightly GPU image (which already has all extras installed)
# with the PR's source code. This is the MetaWorld benchmark image; it needs
# no benchmark-specific setup beyond the source overlay.
#
# Build:  docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld .
# Run:    docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ...

# The ":latest" tag is floating, so the exact base contents depend on when the
# image is built.
FROM huggingface/lerobot-gpu:latest

# Overlay the PR's source code on top of the nightly image.
# The build context is copied into the working directory inherited from the
# base image (no WORKDIR is set here) and chown'ed to user_lerobot.
COPY --chown=user_lerobot:user_lerobot . .

# Default to an interactive shell; CI overrides this with an explicit command.
CMD ["/bin/bash"]
def _libero_descriptions(task_suite: str) -> dict[str, str]:
    """Return the language instruction of every task in a LIBERO suite.

    Keys have the form ``"<task_suite>_<index>"``; values are the ``language``
    attribute of the corresponding task. An unknown suite name is reported on
    stderr and yields an empty dict.
    """
    # Imported lazily: libero is only installed inside the benchmark image.
    from libero.libero import benchmark  # type: ignore[import-untyped]

    available = benchmark.get_benchmark_dict()
    if task_suite in available:
        suite = available[task_suite]()
        descriptions: dict[str, str] = {}
        for idx in range(suite.n_tasks):
            descriptions[f"{task_suite}_{idx}"] = suite.get_task(idx).language
        return descriptions

    print(
        f"[extract_task_descriptions] Unknown LIBERO suite '{task_suite}'. "
        f"Available: {list(available.keys())}",
        file=sys.stderr,
    )
    return {}


def _metaworld_descriptions(task_name: str) -> dict[str, str]:
    """Return a single-entry mapping describing a MetaWorld task.

    MetaWorld tasks don't expose a separate natural-language description, so
    the description is derived from the task name: the ``metaworld-`` prefix
    is dropped and dashes become spaces. The key is ``"<task_name>_0"``.
    """
    without_prefix = task_name.removeprefix("metaworld-")
    readable = without_prefix.replace("-", " ").strip()
    return {f"{task_name}_0": readable}


def main() -> int:
    """Write the task descriptions for ``--env``/``--task`` to ``--output``.

    The output file is a JSON object of the form
    ``{"<task>_<index>": "<description>", ...}``. Extraction is best-effort:
    an unsupported env or any extractor error is reported on stderr and an
    empty object is written instead. Always returns 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
    cli.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
    cli.add_argument("--output", required=True, help="Path to write task_descriptions.json")
    args = cli.parse_args()

    descriptions: dict[str, str] = {}
    try:
        # One extractor per supported environment family.
        extractors = {
            "libero": _libero_descriptions,
            "metaworld": _metaworld_descriptions,
        }
        extractor = extractors.get(args.env)
        if extractor is None:
            print(
                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
                file=sys.stderr,
            )
        else:
            descriptions = extractor(args.task)
    except Exception as exc:
        # Deliberately broad: a missing description must never fail the eval job.
        print(f"[extract_task_descriptions] Warning: {exc}", file=sys.stderr)

    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(descriptions, indent=2)
    destination.write_text(payload)
    print(f"[extract_task_descriptions] {len(descriptions)} descriptions → {destination}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
# NOTE: This script runs on the bare CI runner (not inside Docker), so it
# must use only Python stdlib modules. Do not add third-party imports.


def _safe_float(v: float | int | None) -> float | None:
    """Return *v* as a float, or ``None`` if it is missing or not finite.

    NaN and +/-infinity map to ``None`` because ``json.dumps`` would otherwise
    emit the non-standard ``NaN``/``Infinity`` tokens into metrics.json.

    Raises:
        TypeError, ValueError: if *v* cannot be converted by ``float()``.
    """
    if v is None:
        return None
    f = float(v)
    return f if math.isfinite(f) else None


def _safe_int(v: float | int | None) -> int | None:
    """Return *v* truncated to an int, or ``None`` if it is missing or not finite.

    The finiteness check runs before ``int()``: ``int(float("inf"))`` raises
    ``OverflowError`` and ``int(float("nan"))`` raises ``ValueError``.

    Raises:
        TypeError, ValueError: if *v* cannot be converted by ``float()``.
    """
    if v is None:
        return None
    f = float(v)
    return int(f) if math.isfinite(f) else None


def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
    """Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.

    Handles two output shapes:
    - Single-task: {"aggregated": {"pc_success": 80.0, ...}}
    - Multi-task:  {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}

    The first section (in that order) whose ``pc_success`` is present and
    finite wins. Sections that are missing, are not JSON objects, or have no
    usable ``pc_success`` are skipped. Returns four ``None`` values when no
    section qualifies.
    """
    if not isinstance(info, dict):
        return None, None, None, None

    for key in ("aggregated", "overall"):
        agg = info.get(key)
        if not isinstance(agg, dict):
            continue

        pc = _safe_float(agg.get("pc_success"))
        if pc is None:
            continue

        return (
            pc,
            _safe_int(agg.get("n_episodes")),
            _safe_float(agg.get("avg_sum_reward")),
            _safe_float(agg.get("eval_s")),
        )

    return None, None, None, None


def main() -> int:
    """Build ``<artifacts-dir>/metrics.json`` from the eval artifacts.

    Reads ``eval_info.json`` and, if present, ``task_descriptions.json`` from
    ``--artifacts-dir``. Missing or unparsable inputs are reported on stderr
    and leave the corresponding metrics as ``null``, so that a metrics file is
    still written after a failed eval. Always returns 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    eval_info_path = artifacts_dir / "eval_info.json"

    pc_success: float | None = None
    n_episodes: int | None = None
    avg_sum_reward: float | None = None
    eval_s: float | None = None

    if eval_info_path.exists():
        try:
            info = json.loads(eval_info_path.read_text(encoding="utf-8"))
            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
        except (json.JSONDecodeError, KeyError, TypeError, ValueError, OverflowError) as exc:
            # ValueError/OverflowError cover non-numeric or out-of-range metric values.
            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
    else:
        print(
            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
            file=sys.stderr,
        )

    task_descriptions: dict[str, str] = {}
    task_desc_path = artifacts_dir / "task_descriptions.json"
    if task_desc_path.exists():
        try:
            task_descriptions = json.loads(task_desc_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError as exc:
            print(
                f"[parse_eval_metrics] Warning: could not parse task_descriptions.json: {exc}",
                file=sys.stderr,
            )

    metrics = {
        "env": args.env,
        "task": args.task,
        "policy": args.policy,
        "pc_success": pc_success,
        "n_episodes": n_episodes,
        "avg_sum_reward": avg_sum_reward,
        "eval_s": eval_s,
        "task_descriptions": task_descriptions,
    }

    # The directory may be absent if no artifacts were ever copied out; create
    # it so the (all-null) metrics file can still be written and uploaded.
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    out_path = artifacts_dir / "metrics.json"
    out_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    print(f"[parse_eval_metrics] Written: {out_path}")
    print(json.dumps(metrics, indent=2))

    return 0


if __name__ == "__main__":
    sys.exit(main())