From 437014926f066eba0e6475d200723dfb8f1cf032 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 8 Apr 2026 14:44:59 +0200 Subject: [PATCH] feat(ci): add benchmark smoke tests with isolated Docker images Each benchmark gets its own image (lerobot[,smolvla]) so incompatible dep trees can never collide. A 1-episode smoke eval runs per benchmark on GPU runners. - Libero: pepijn223/smolvla_libero, libero_spatial, camera_name_mapping - MetaWorld: pepijn223/smolvla_metaworld, metaworld-push-v2 - LIBERO config pre-created at build time to bypass interactive stdin prompt - Triggers on envs/**, lerobot_eval.py, Dockerfiles, pyproject.toml changes - Adds docs/source/evaluation.mdx and restores step 7 in adding_benchmarks Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/benchmark_tests.yml | 173 ++++++++++++++++++++++++++ docker/Dockerfile.benchmark.libero | 85 +++++++++++++ docker/Dockerfile.benchmark.metaworld | 74 +++++++++++ docs/source/_toctree.yml | 2 + docs/source/adding_benchmarks.mdx | 95 ++++++++++++-- docs/source/evaluation.mdx | 162 ++++++++++++++++++++++++ 6 files changed, 581 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/benchmark_tests.yml create mode 100644 docker/Dockerfile.benchmark.libero create mode 100644 docker/Dockerfile.benchmark.metaworld create mode 100644 docs/source/evaluation.mdx diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml new file mode 100644 index 000000000..c7a5e0dca --- /dev/null +++ b/.github/workflows/benchmark_tests.yml @@ -0,0 +1,173 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Integration tests: build an isolated Docker image per benchmark and run a +# 1-episode smoke eval. Each benchmark gets its own image so incompatible +# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide. +# +# To add a new benchmark: +# 1. Add docker/Dockerfile.benchmark. (install only lerobot[]) +# 2. Copy one of the jobs below and adjust the image name and eval command. +name: Benchmark Integration Tests + +on: + # Run manually from the Actions tab + workflow_dispatch: + + push: + branches: + - feat/benchmark-ci + - main + paths: + - "src/lerobot/envs/**" + - "src/lerobot/scripts/lerobot_eval.py" + - "docker/Dockerfile.benchmark.*" + - ".github/workflows/benchmark_tests.yml" + - "pyproject.toml" + + pull_request: + branches: + - main + paths: + - "src/lerobot/envs/**" + - "src/lerobot/scripts/lerobot_eval.py" + - "docker/Dockerfile.benchmark.*" + - ".github/workflows/benchmark_tests.yml" + - "pyproject.toml" + +permissions: + contents: read + +env: + UV_VERSION: "0.8.0" + PYTHON_VERSION: "3.12" + +# Cancel in-flight runs for the same branch/PR. 
+concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + # ── LIBERO ──────────────────────────────────────────────────────────────── + # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain) + libero-integration-test: + name: Libero — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + # Build the benchmark-specific image; layer cache lives in the runner's + # local Docker daemon — reused across re-runs on the same machine. + - name: Build Libero benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.libero + push: false + load: true + tags: lerobot-benchmark-libero:ci + cache-from: type=local,src=/tmp/.buildx-cache-libero + cache-to: type=local,dest=/tmp/.buildx-cache-libero,mode=max + + - name: Login to Hugging Face + if: env.HF_USER_TOKEN != '' + run: | + docker run --rm \ + -e HF_HOME=/tmp/hf \ + lerobot-benchmark-libero:ci \ + bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami" + + - name: Run Libero smoke eval (1 episode) + run: | + docker run --rm --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + lerobot-benchmark-libero:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=pepijn223/smolvla_libero \ + --env.type=libero \ + --env.task=libero_spatial \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + 
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \ + --policy.empty_cameras=1 + " + + # ── METAWORLD ───────────────────────────────────────────────────────────── + # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain) + metaworld-integration-test: + name: MetaWorld — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Build MetaWorld benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.metaworld + push: false + load: true + tags: lerobot-benchmark-metaworld:ci + cache-from: type=local,src=/tmp/.buildx-cache-metaworld + cache-to: type=local,dest=/tmp/.buildx-cache-metaworld,mode=max + + - name: Run MetaWorld smoke eval (1 episode) + run: | + docker run --rm --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + lerobot-benchmark-metaworld:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=pepijn223/smolvla_metaworld \ + --env.type=metaworld \ + --env.task=metaworld-push-v2 \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda + # TODO: add --env.rename_map once implemented + " diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero new file mode 100644 index 000000000..b3969d491 --- /dev/null +++ b/docker/Dockerfile.benchmark.libero @@ -0,0 +1,85 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Isolated benchmark image for LIBERO integration tests. +# Installs only lerobot[libero] so its dep tree (hf-libero, dm-control, mujoco) +# cannot conflict with other benchmarks. +# +# Build: docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero . +# Run: docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ... + +ARG CUDA_VERSION=12.4.1 +ARG OS_VERSION=22.04 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} + +ARG PYTHON_VERSION=3.12 + +ENV DEBIAN_FRONTEND=noninteractive \ + MUJOCO_GL=egl \ + PATH=/lerobot/.venv/bin:$PATH \ + CUDA_VISIBLE_DEVICES=0 \ + DEVICE=cuda + +# System deps — same set as Dockerfile.internal +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common build-essential git curl \ + libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ + libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ + cmake pkg-config ninja-build \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && mv /root/.local/bin/uv /usr/local/bin/uv \ + && useradd --create-home --shell /bin/bash user_lerobot \ + && usermod -aG sudo user_lerobot \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /lerobot +RUN chown 
-R user_lerobot:user_lerobot /lerobot +USER user_lerobot + +ENV HOME=/home/user_lerobot \ + HF_HOME=/home/user_lerobot/.cache/huggingface \ + HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ + TORCH_HOME=/home/user_lerobot/.cache/torch \ + TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton + +RUN uv venv --python python${PYTHON_VERSION} + +# Install only lerobot[libero] — completely isolated from metaworld's dep tree +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +COPY --chown=user_lerobot:user_lerobot src/ src/ + +RUN uv sync --locked --extra libero --extra smolvla --no-cache + +# Pre-create libero's config file pointing to the bundled package assets. +# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing. +# We use importlib.util.find_spec (does NOT execute libero, so no prompt) to locate +# the package, then write the config to the correct bundled paths. +RUN LIBERO_DIR=$(python${PYTHON_VERSION} -c \ + "import importlib.util, os; s=importlib.util.find_spec('libero'); \ + print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ + mkdir -p /home/user_lerobot/.libero && \ + printf "assets: ${LIBERO_DIR}/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas + +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld new file mode 100644 index 000000000..0c916c553 --- /dev/null +++ b/docker/Dockerfile.benchmark.metaworld @@ -0,0 +1,74 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Isolated benchmark image for MetaWorld integration tests. +# Installs only lerobot[metaworld] so its dep tree (metaworld==3.0.0, mujoco>=3) +# cannot conflict with other benchmarks. +# +# Build: docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld . +# Run: docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ... + +ARG CUDA_VERSION=12.4.1 +ARG OS_VERSION=22.04 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} + +ARG PYTHON_VERSION=3.12 + +ENV DEBIAN_FRONTEND=noninteractive \ + MUJOCO_GL=egl \ + PATH=/lerobot/.venv/bin:$PATH \ + CUDA_VISIBLE_DEVICES=0 \ + DEVICE=cuda + +# System deps — same set as Dockerfile.internal +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common build-essential git curl \ + libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ + libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ + cmake pkg-config ninja-build \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && mv /root/.local/bin/uv /usr/local/bin/uv \ + && useradd --create-home --shell /bin/bash user_lerobot \ + && usermod -aG sudo user_lerobot \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /lerobot +RUN chown -R user_lerobot:user_lerobot /lerobot +USER user_lerobot + +ENV HOME=/home/user_lerobot \ + 
HF_HOME=/home/user_lerobot/.cache/huggingface \ + HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ + TORCH_HOME=/home/user_lerobot/.cache/torch \ + TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton + +RUN uv venv --python python${PYTHON_VERSION} + +# Install only lerobot[metaworld] — completely isolated from libero's dep tree +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +COPY --chown=user_lerobot:user_lerobot src/ src/ + +RUN uv sync --locked --extra metaworld --extra smolvla --no-cache + +RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas + +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 3dcba5993..f69f6d900 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -73,6 +73,8 @@ title: Control & Train Robots in Sim (LeIsaac) title: "Simulation" - sections: + - local: evaluation + title: Evaluation (lerobot-eval) - local: adding_benchmarks title: Adding a New Benchmark - local: libero diff --git a/docs/source/adding_benchmarks.mdx b/docs/source/adding_benchmarks.mdx index 3a024f026..1b1df41b7 100644 --- a/docs/source/adding_benchmarks.mdx +++ b/docs/source/adding_benchmarks.mdx @@ -122,15 +122,17 @@ Each `EnvConfig` subclass declares two dicts that tell the policy what to expect ### Checklist -| File | Required | Why | -| ---------------------------------------- | -------- | ------------------------------------------------------------ | -| `src/lerobot/envs/.py` | Yes | Wraps the simulator as a standard gym.Env | -| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark and its `create_envs()` for the CLI | -| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms | -| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys | -| `pyproject.toml` | Yes | Declares 
benchmark-specific dependencies | -| `docs/source/.mdx` | Yes | User-facing documentation page | -| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar | +| File | Required | Why | +| ----------------------------------------- | -------- | ------------------------------------------------------------ | +| `src/lerobot/envs/.py` | Yes | Wraps the simulator as a standard gym.Env | +| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark and its `create_envs()` for the CLI | +| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms | +| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys | +| `pyproject.toml` | Yes | Declares benchmark-specific dependencies | +| `docs/source/.mdx` | Yes | User-facing documentation page | +| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar | +| `docker/Dockerfile.benchmark.` | Yes | Isolated Docker image for CI smoke tests | +| `.github/workflows/benchmark_tests.yml` | Yes | CI job that builds the image and runs a 1-episode smoke eval | ### 1. The gym.Env wrapper (`src/lerobot/envs/.py`) @@ -295,6 +297,78 @@ Add your benchmark to the "Benchmarks" section: title: "Benchmarks" ``` +### 7. CI smoke test (`docker/` + `.github/workflows/benchmark_tests.yml`) + +Each benchmark must have an isolated Docker image and a CI job that runs a 1-episode eval. This catches install-time regressions (broken transitive deps, import errors, interactive prompts) before they reach users. + +**Create `docker/Dockerfile.benchmark.`** — copy an existing one and change only the extra name: + +```dockerfile +# Isolated benchmark image — installs lerobot[] only. +# Build: docker build -f docker/Dockerfile.benchmark. -t lerobot-benchmark- . +ARG CUDA_VERSION=12.4.1 +ARG OS_VERSION=22.04 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} +ARG PYTHON_VERSION=3.12 +# ... (same system deps as Dockerfile.benchmark.libero) ... 
+RUN uv sync --locked --extra <benchmark> --no-cache
+```
+
+Each benchmark gets its own image so its dependency tree (pinned simulator packages, specific mujoco/scipy versions) cannot conflict with other benchmarks.
+
+**Add a job to `.github/workflows/benchmark_tests.yml`** — copy an existing job block and adjust:
+
+```yaml
+<benchmark>-integration-test:
+  name: <Benchmark> — build image + 1-episode eval
+  runs-on:
+    group: aws-g6-4xlarge-plus
+  env:
+    HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+  steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      with:
+        persist-credentials: false
+        lfs: true
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+      with:
+        cache-binary: false
+    - name: Build image
+      uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+      with:
+        context: .
+        file: docker/Dockerfile.benchmark.<benchmark>
+        push: false
+        load: true
+        tags: lerobot-benchmark-<benchmark>:ci
+        cache-from: type=local,src=/tmp/.buildx-cache-<benchmark>
+        cache-to: type=local,dest=/tmp/.buildx-cache-<benchmark>,mode=max
+    - name: Run smoke eval (1 episode)
+      run: |
+        docker run --rm --gpus all \
+          --shm-size=4g \
+          -e HF_HOME=/tmp/hf \
+          -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+          lerobot-benchmark-<benchmark>:ci \
+          bash -c "
+            hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+            lerobot-eval \
+              --policy.path=<policy_repo_id> \
+              --env.type=<benchmark> \
+              --env.task=<task> \
+              --eval.batch_size=1 \
+              --eval.n_episodes=1 \
+              --eval.use_async_envs=false \
+              --policy.device=cuda
+          "
+```
+
+**Tips:**
+
+- If the benchmark library prompts for user input on import (like LIBERO asking for a dataset folder), pass the relevant env var in the `docker run` command (e.g. `-e LIBERO_DATA_FOLDER=/tmp/libero_data`).
+- The job is scoped to only trigger on changes to `src/lerobot/envs/**`, `src/lerobot/scripts/lerobot_eval.py`, and the Dockerfiles — it won't run on unrelated PRs.
+ ## Verifying your integration After completing the steps above, confirm that everything works: @@ -303,6 +377,7 @@ After completing the steps above, confirm that everything works: 2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys. 3. **Run a full eval** — `lerobot-eval --env.type= --env.task= --eval.n_episodes=1 --policy.path=` to exercise the full pipeline end-to-end. (`batch_size` defaults to auto-tuning based on CPU cores; pass `--eval.batch_size=1` to force a single environment.) 4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates. +5. **Add CI smoke test** — follow step 7 above to add a Dockerfile and CI job. This ensures the install stays green as dependencies evolve. ## Writing a benchmark doc page @@ -313,7 +388,7 @@ Each benchmark `.mdx` page should include: - **Overview image or GIF.** - **Available tasks** — table of task suites with counts and brief descriptions. - **Installation** — `pip install -e ".[]"` plus any extra steps (env vars, system packages). -- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` for reproducible results. `batch_size` defaults to auto; only specify it if needed. Include single-task and multi-task examples if applicable. +- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` for reproducible results. `batch_size` defaults to auto; only specify it if needed. Include single-task and multi-task examples if applicable. See the [Evaluation guide](evaluation) for details. - **Policy inputs and outputs** — observation keys with shapes, action space description. - **Recommended evaluation episodes** — how many episodes per task is standard. - **Training** — example `lerobot-train` command. 
diff --git a/docs/source/evaluation.mdx b/docs/source/evaluation.mdx new file mode 100644 index 000000000..ecd0cc1d6 --- /dev/null +++ b/docs/source/evaluation.mdx @@ -0,0 +1,162 @@ +# Evaluation + +`lerobot-eval` runs a trained policy on a simulation benchmark and reports success rate, reward, and (optionally) episode videos. It handles environment creation, batched rollouts, and metric aggregation automatically. + +## Quick start + +Evaluate a Hub-hosted policy on LIBERO: + +```bash +lerobot-eval \ + --policy.path=pepijn223/smolvla_libero \ + --env.type=libero \ + --env.task=libero_spatial \ + --eval.n_episodes=10 \ + --policy.device=cuda +``` + +Evaluate a local checkpoint: + +```bash +lerobot-eval \ + --policy.path=outputs/train/act_pusht/checkpoints/005000/pretrained_model \ + --env.type=pusht \ + --eval.n_episodes=10 +``` + +`batch_size` defaults to **auto** (based on CPU cores). The script picks the right number of parallel environments for your machine. + +## Key flags + +| Flag | Default | Description | +| ----------------------- | -------------- | ------------------------------------------------------------------------------------- | +| `--policy.path` | required | Hub repo ID or local path to a pretrained model | +| `--env.type` | required | Benchmark name (`pusht`, `libero`, `metaworld`, etc.) | +| `--env.task` | varies | Task or suite name (e.g. `libero_spatial`, `libero_10`) | +| `--eval.n_episodes` | `50` | Total episodes to run (across all tasks) | +| `--eval.batch_size` | `0` (auto) | Number of parallel environments. `0` = auto-tune from CPU cores | +| `--eval.use_async_envs` | `true` | Use `AsyncVectorEnv` (parallel stepping). 
Auto-downgrades to sync when `batch_size=1` | +| `--policy.device` | `cuda` | Inference device | +| `--policy.use_amp` | `false` | Mixed-precision inference (saves VRAM, faster on Ampere+) | +| `--seed` | `1000` | Random seed for reproducibility | +| `--output_dir` | auto-generated | Where to write results and videos | + +### Environment-specific flags + +Some benchmarks accept additional flags through `--env.*`: + +```bash +# LIBERO: map simulator camera names to policy feature names +--env.camera_name_mapping='{"agentview_image": "camera1", "robot0_eye_in_hand_image": "camera2"}' + +# Fill unused camera slots with zeros +--policy.empty_cameras=1 +``` + +See each benchmark's documentation ([LIBERO](libero), [Meta-World](metaworld)) for benchmark-specific flags. + +## How batch_size works + +`batch_size` controls how many environments run in parallel within a single `VectorEnv`: + +| `batch_size` | Behavior | +| ------------- | -------------------------------------------------------------------- | +| `0` (default) | Auto-tune: `floor(cpu_cores × 0.7)`, capped by `n_episodes` and `64` | +| `1` | Single environment, synchronous. Useful for debugging | +| `N` | N environments step in parallel via `AsyncVectorEnv` | + +When `batch_size > 1` and `use_async_envs=true`, each environment runs in its own subprocess via Gymnasium's `AsyncVectorEnv`. This parallelizes the simulation stepping (the main bottleneck), while the policy runs a single batched forward pass on GPU. + +**Example:** On a 16-core machine with `n_episodes=100`: + +- Auto batch_size = `floor(16 × 0.7)` = `11` +- 11 environments step simultaneously → ~11× faster than sequential + +## Performance + +### AsyncVectorEnv (default) + +`AsyncVectorEnv` spawns one subprocess per environment. Each subprocess has its own simulator instance. While the policy computes actions on GPU, all environments step in parallel on CPU: + +``` +GPU: [inference]....[inference]....[inference].... 
+CPU: [step × N]....................[step × N]...... + ↑ parallel ↑ parallel +``` + +For GPU-based simulators (LIBERO, Meta-World), the environments use **lazy initialization**: the GPU/EGL context is created inside the worker subprocess on first `reset()`, not in the parent process. This avoids `EGL_BAD_CONTEXT` crashes from inheriting stale GPU handles across `fork()`. + +### Lazy task loading + +For multi-task benchmarks (e.g. LIBERO with 10 tasks), environments are wrapped in `_LazyAsyncVectorEnv` which defers worker creation until the task is actually evaluated. This keeps peak process count = `batch_size` instead of `n_tasks × batch_size`. After each task completes, workers are closed to free resources. + +### Tuning for speed + +| Situation | Recommendation | +| ------------------------------ | ----------------------------------------------------- | +| Slow eval, low GPU utilization | Increase `batch_size` (or leave at auto) | +| Out of memory (system RAM) | Decrease `batch_size` | +| Out of GPU memory | Decrease `batch_size`, or use `--policy.use_amp=true` | +| Debugging / single-stepping | `--eval.batch_size=1 --eval.use_async_envs=false` | + +## Output + +Results are written to `output_dir` (default: `outputs/eval//