From c4d7e7468b4b9d192e385155be2bb5d95ca298d9 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 8 Apr 2026 14:33:48 +0200 Subject: [PATCH] chore: remove out-of-scope benchmark/CI/docs files from PR Benchmark CI workflow, Dockerfiles, benchmark docs, evaluation smoke-test doc, and dispatch tests belong in a separate PR. Scope this PR to the async env init changes only. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/benchmark_tests.yml | 171 ----------- docker/Dockerfile.benchmark.libero | 85 ------ docker/Dockerfile.benchmark.metaworld | 74 ----- docs/source/_toctree.yml | 2 - docs/source/adding_benchmarks.mdx | 397 -------------------------- docs/source/evaluation.mdx | 162 ----------- tests/envs/test_dispatch.py | 143 ---------- 7 files changed, 1034 deletions(-) delete mode 100644 .github/workflows/benchmark_tests.yml delete mode 100644 docker/Dockerfile.benchmark.libero delete mode 100644 docker/Dockerfile.benchmark.metaworld delete mode 100644 docs/source/adding_benchmarks.mdx delete mode 100644 docs/source/evaluation.mdx delete mode 100644 tests/envs/test_dispatch.py diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml deleted file mode 100644 index 52f73fe46..000000000 --- a/.github/workflows/benchmark_tests.yml +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Integration tests: build an isolated Docker image per benchmark and run a -# 1-episode smoke eval. Each benchmark gets its own image so incompatible -# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide. -# -# To add a new benchmark: -# 1. Add docker/Dockerfile.benchmark. (install only lerobot[]) -# 2. Copy one of the jobs below and adjust the image name and eval command. -name: Benchmark Integration Tests - -on: - # Run manually from the Actions tab - workflow_dispatch: - - push: - branches: - - feat/async-vector-env - paths: - - "src/lerobot/envs/**" - - "src/lerobot/scripts/lerobot_eval.py" - - "docker/Dockerfile.benchmark.*" - - ".github/workflows/benchmark_tests.yml" - - "pyproject.toml" - - pull_request: - branches: - - main - paths: - - "src/lerobot/envs/**" - - "src/lerobot/scripts/lerobot_eval.py" - - "docker/Dockerfile.benchmark.*" - - ".github/workflows/benchmark_tests.yml" - - "pyproject.toml" - -permissions: - contents: read - -env: - UV_VERSION: "0.8.0" - PYTHON_VERSION: "3.12" - -# Cancel in-flight runs for the same branch/PR. -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - # ── LIBERO ──────────────────────────────────────────────────────────────── - # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain) - libero-integration-test: - name: Libero — build image + 1-episode eval - runs-on: - group: aws-g6-4xlarge-plus - env: - HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} - - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - lfs: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] - with: - cache-binary: false - - # Build the benchmark-specific image; layer cache lives in the runner's - # local Docker daemon — reused across re-runs on the same machine. - - name: Build Libero benchmark image - uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] - with: - context: . - file: docker/Dockerfile.benchmark.libero - push: false - load: true - tags: lerobot-benchmark-libero:ci - cache-from: type=local,src=/tmp/.buildx-cache-libero - cache-to: type=local,dest=/tmp/.buildx-cache-libero,mode=max - - - name: Login to Hugging Face - if: env.HF_USER_TOKEN != '' - run: | - docker run --rm \ - -e HF_HOME=/tmp/hf \ - lerobot-benchmark-libero:ci \ - bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami" - - - name: Run Libero smoke eval (1 episode) - run: | - docker run --rm --gpus all \ - --shm-size=4g \ - -e HF_HOME=/tmp/hf \ - -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ - lerobot-benchmark-libero:ci \ - bash -c " - hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true - lerobot-eval \ - --policy.path=pepijn223/smolvla_libero \ - --env.type=libero \ - --env.task=libero_spatial \ - --eval.batch_size=1 \ - --eval.n_episodes=1 \ - --eval.use_async_envs=false \ - --policy.device=cuda \ - '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \ - --policy.empty_cameras=1 - " - - # ── METAWORLD ───────────────────────────────────────────────────────────── - # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain) - metaworld-integration-test: - name: MetaWorld — build image + 1-episode eval - runs-on: - group: aws-g6-4xlarge-plus - env: - HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} - - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - lfs: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] - with: - cache-binary: false - - - name: Build MetaWorld benchmark image - uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] - with: - context: . - file: docker/Dockerfile.benchmark.metaworld - push: false - load: true - tags: lerobot-benchmark-metaworld:ci - cache-from: type=local,src=/tmp/.buildx-cache-metaworld - cache-to: type=local,dest=/tmp/.buildx-cache-metaworld,mode=max - - - name: Run MetaWorld smoke eval (1 episode) - run: | - docker run --rm --gpus all \ - --shm-size=4g \ - -e HF_HOME=/tmp/hf \ - -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ - lerobot-benchmark-metaworld:ci \ - bash -c " - hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true - lerobot-eval \ - --policy.path=pepijn223/smolvla_metaworld \ - --env.type=metaworld \ - --env.task=metaworld-push-v2 \ - --eval.batch_size=1 \ - --eval.n_episodes=1 \ - --eval.use_async_envs=false \ - --policy.device=cuda - " diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero deleted file mode 100644 index b3969d491..000000000 --- a/docker/Dockerfile.benchmark.libero +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Isolated benchmark image for LIBERO integration tests. -# Installs only lerobot[libero] so its dep tree (hf-libero, dm-control, mujoco) -# cannot conflict with other benchmarks. -# -# Build: docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero . -# Run: docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ... - -ARG CUDA_VERSION=12.4.1 -ARG OS_VERSION=22.04 -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} - -ARG PYTHON_VERSION=3.12 - -ENV DEBIAN_FRONTEND=noninteractive \ - MUJOCO_GL=egl \ - PATH=/lerobot/.venv/bin:$PATH \ - CUDA_VISIBLE_DEVICES=0 \ - DEVICE=cuda - -# System deps — same set as Dockerfile.internal -RUN apt-get update && apt-get install -y --no-install-recommends \ - software-properties-common build-essential git curl \ - libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ - libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ - cmake pkg-config ninja-build \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - python${PYTHON_VERSION} \ - python${PYTHON_VERSION}-venv \ - python${PYTHON_VERSION}-dev \ - && curl -LsSf https://astral.sh/uv/install.sh | sh \ - && mv /root/.local/bin/uv /usr/local/bin/uv \ - && useradd --create-home --shell /bin/bash user_lerobot \ - && usermod -aG sudo user_lerobot \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -WORKDIR /lerobot -RUN chown -R user_lerobot:user_lerobot /lerobot -USER user_lerobot - -ENV HOME=/home/user_lerobot \ - HF_HOME=/home/user_lerobot/.cache/huggingface \ - HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ - TORCH_HOME=/home/user_lerobot/.cache/torch \ - TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton - -RUN uv venv --python python${PYTHON_VERSION} - -# Install only lerobot[libero] — completely isolated from metaworld's dep tree -COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ -COPY --chown=user_lerobot:user_lerobot src/ src/ - -RUN uv sync --locked --extra libero --extra smolvla --no-cache - -# Pre-create libero's config file pointing to the bundled package assets. -# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing. -# We use importlib.util.find_spec (does NOT execute libero, so no prompt) to locate -# the package, then write the config to the correct bundled paths. -RUN LIBERO_DIR=$(python${PYTHON_VERSION} -c \ - "import importlib.util, os; s=importlib.util.find_spec('libero'); \ - print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ - mkdir -p /home/user_lerobot/.libero && \ - printf "assets: ${LIBERO_DIR}/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ - > /home/user_lerobot/.libero/config.yaml - -RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas - -COPY --chown=user_lerobot:user_lerobot . . - -CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld deleted file mode 100644 index 0c916c553..000000000 --- a/docker/Dockerfile.benchmark.metaworld +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Isolated benchmark image for MetaWorld integration tests. -# Installs only lerobot[metaworld] so its dep tree (metaworld==3.0.0, mujoco>=3) -# cannot conflict with other benchmarks. -# -# Build: docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld . -# Run: docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ... - -ARG CUDA_VERSION=12.4.1 -ARG OS_VERSION=22.04 -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} - -ARG PYTHON_VERSION=3.12 - -ENV DEBIAN_FRONTEND=noninteractive \ - MUJOCO_GL=egl \ - PATH=/lerobot/.venv/bin:$PATH \ - CUDA_VISIBLE_DEVICES=0 \ - DEVICE=cuda - -# System deps — same set as Dockerfile.internal -RUN apt-get update && apt-get install -y --no-install-recommends \ - software-properties-common build-essential git curl \ - libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ - libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ - cmake pkg-config ninja-build \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - python${PYTHON_VERSION} \ - python${PYTHON_VERSION}-venv \ - python${PYTHON_VERSION}-dev \ - && curl -LsSf https://astral.sh/uv/install.sh | sh \ - && mv /root/.local/bin/uv /usr/local/bin/uv \ - && useradd --create-home --shell /bin/bash user_lerobot \ - && usermod -aG sudo user_lerobot \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -WORKDIR /lerobot -RUN chown -R user_lerobot:user_lerobot /lerobot -USER user_lerobot - -ENV HOME=/home/user_lerobot \ - HF_HOME=/home/user_lerobot/.cache/huggingface \ - HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \ - TORCH_HOME=/home/user_lerobot/.cache/torch \ - TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton - -RUN uv venv --python python${PYTHON_VERSION} - -# Install only lerobot[metaworld] — completely isolated from libero's dep tree -COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ -COPY --chown=user_lerobot:user_lerobot src/ src/ - -RUN uv sync --locked --extra metaworld --extra smolvla --no-cache - -RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas - -COPY --chown=user_lerobot:user_lerobot . . - -CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index f69f6d900..3dcba5993 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -73,8 +73,6 @@ title: Control & Train Robots in Sim (LeIsaac) title: "Simulation" - sections: - - local: evaluation - title: Evaluation (lerobot-eval) - local: adding_benchmarks title: Adding a New Benchmark - local: libero diff --git a/docs/source/adding_benchmarks.mdx b/docs/source/adding_benchmarks.mdx deleted file mode 100644 index 1b1df41b7..000000000 --- a/docs/source/adding_benchmarks.mdx +++ /dev/null @@ -1,397 +0,0 @@ -# Adding a New Benchmark - -This guide walks you through adding a new simulation benchmark to LeRobot. Follow the steps in order and use the existing benchmarks as templates. - -A benchmark in LeRobot is a set of [Gymnasium](https://gymnasium.farama.org/) environments that wrap a third-party simulator (like LIBERO or Meta-World) behind a standard `gym.Env` interface. The `lerobot-eval` CLI then runs evaluation uniformly across all benchmarks. - -## Existing benchmarks at a glance - -Before diving in, here is what is already integrated: - -| Benchmark | Env file | Config class | Tasks | Action dim | Processor | -| -------------- | ------------------- | ------------------ | ------------------- | ------------ | ---------------------------- | -| LIBERO | `envs/libero.py` | `LiberoEnv` | 130 across 5 suites | 7 | `LiberoProcessorStep` | -| Meta-World | `envs/metaworld.py` | `MetaworldEnv` | 50 (MT50) | 4 | None | -| IsaacLab Arena | Hub-hosted | `IsaaclabArenaEnv` | Configurable | Configurable | `IsaaclabArenaProcessorStep` | - -Use `src/lerobot/envs/libero.py` and `src/lerobot/envs/metaworld.py` as reference implementations. - -## How it all fits together - -### Data flow - -During evaluation, data moves through four stages: - -``` -1. gym.Env ──→ raw observations (numpy dicts) - -2. Preprocessing ──→ standard LeRobot keys + task description - (preprocess_observation in envs/utils.py, env.call("task_description")) - -3. Processors ──→ env-specific then policy-specific transforms - (env_preprocessor, policy_preprocessor) - -4. Policy ──→ select_action() ──→ action tensor - then reverse: policy_postprocessor → env_postprocessor → numpy action → env.step() -``` - -Most benchmarks only need to care about stage 1 (producing observations in the right format) and optionally stage 3 (if env-specific transforms are needed). - -### Environment structure - -`make_env()` returns a nested dict of vectorized environments: - -```python -dict[str, dict[int, gym.vector.VectorEnv]] -# ^suite ^task_id -``` - -A single-task env (e.g. PushT) looks like `{"pusht": {0: vec_env}}`. -A multi-task benchmark (e.g. LIBERO) looks like `{"libero_spatial": {0: vec0, 1: vec1, ...}, ...}`. - -### How evaluation runs - -All benchmarks are evaluated the same way by `lerobot-eval`: - -1. `make_env()` builds the nested `{suite: {task_id: VectorEnv}}` dict. -2. `eval_policy_all()` iterates over every suite and task. -3. For each task, it runs `n_episodes` rollouts via `rollout()`. -4. Results are aggregated hierarchically: episode, task, suite, overall. -5. Metrics include `pc_success` (success rate), `avg_sum_reward`, and `avg_max_reward`. - -The critical piece: your env must return `info["is_success"]` on every `step()` call. This is how the eval loop knows whether a task was completed. - -## What your environment must provide - -LeRobot does not enforce a strict observation schema. Instead it relies on a set of conventions that all benchmarks follow. - -### Env attributes - -Your `gym.Env` must set these attributes: - -| Attribute | Type | Why | -| -------------------- | ----- | ---------------------------------------------------- | -| `_max_episode_steps` | `int` | `rollout()` uses this to cap episode length | -| `task_description` | `str` | Passed to VLA policies as a language instruction | -| `task` | `str` | Fallback identifier if `task_description` is not set | - -### Success reporting - -Your `step()` and `reset()` must include `"is_success"` in the `info` dict: - -```python -info = {"is_success": True} # or False -return observation, reward, terminated, truncated, info -``` - -### Observations - -The simplest approach is to map your simulator's outputs to the standard keys that `preprocess_observation()` already understands. Do this inside your `gym.Env` (e.g. in a `_format_raw_obs()` helper): - -| Your env should output | LeRobot maps it to | What it is | -| ------------------------- | -------------------------- | ------------------------------------- | -| `"pixels"` (single array) | `observation.image` | Single camera image, HWC uint8 | -| `"pixels"` (dict) | `observation.images.` | Multiple cameras, each HWC uint8 | -| `"agent_pos"` | `observation.state` | Proprioceptive state vector | -| `"environment_state"` | `observation.env_state` | Full environment state (e.g. PushT) | -| `"robot_state"` | `observation.robot_state` | Nested robot state dict (e.g. LIBERO) | - -If your simulator uses different key names, you have two options: - -1. **Recommended:** Rename them to the standard keys inside your `gym.Env` wrapper. -2. **Alternative:** Write an env processor to transform observations after `preprocess_observation()` runs (see step 4 below). - -### Actions - -Actions are continuous numpy arrays in a `gym.spaces.Box`. The dimensionality depends on your benchmark (7 for LIBERO, 4 for Meta-World, etc.). Policies adapt to different action dimensions through their `input_features` / `output_features` config. - -### Feature declaration - -Each `EnvConfig` subclass declares two dicts that tell the policy what to expect: - -- `features` — maps feature names to `PolicyFeature(type, shape)` (e.g. action dim, image shape). -- `features_map` — maps raw observation keys to LeRobot convention keys (e.g. `"agent_pos"` to `"observation.state"`). - -## Step by step - - - At minimum, you need two files: a **gym.Env wrapper** and an **EnvConfig - subclass** with a `create_envs()` override. Everything else is optional or - documentation. No changes to `factory.py` are needed. - - -### Checklist - -| File | Required | Why | -| ----------------------------------------- | -------- | ------------------------------------------------------------ | -| `src/lerobot/envs/.py` | Yes | Wraps the simulator as a standard gym.Env | -| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark and its `create_envs()` for the CLI | -| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms | -| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys | -| `pyproject.toml` | Yes | Declares benchmark-specific dependencies | -| `docs/source/.mdx` | Yes | User-facing documentation page | -| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar | -| `docker/Dockerfile.benchmark.` | Yes | Isolated Docker image for CI smoke tests | -| `.github/workflows/benchmark_tests.yml` | Yes | CI job that builds the image and runs a 1-episode smoke eval | - -### 1. The gym.Env wrapper (`src/lerobot/envs/.py`) - -Create a `gym.Env` subclass that wraps the third-party simulator: - -```python -class MyBenchmarkEnv(gym.Env): - metadata = {"render_modes": ["rgb_array"], "render_fps": } - - def __init__(self, task_suite, task_id, ...): - super().__init__() - self.task = - self.task_description = - self._max_episode_steps = - self.observation_space = spaces.Dict({...}) - self.action_space = spaces.Box(low=..., high=..., shape=(...,), dtype=np.float32) - - def reset(self, seed=None, **kwargs): - ... # return (observation, info) — info must contain {"is_success": False} - - def step(self, action: np.ndarray): - ... # return (obs, reward, terminated, truncated, info) — info must contain {"is_success": } - - def render(self): - ... # return RGB image as numpy array - - def close(self): - ... -``` - -**GPU-based simulators (e.g. MuJoCo with EGL rendering):** If your simulator allocates GPU/EGL contexts during `__init__`, defer that allocation to a `_ensure_env()` helper called on first `reset()`/`step()`. This avoids inheriting stale GPU handles when `AsyncVectorEnv` spawns worker processes. See `LiberoEnv._ensure_env()` for the pattern. - -Also provide a factory function that returns the nested dict structure: - -```python -def create_mybenchmark_envs( - task: str, - n_envs: int, - gym_kwargs: dict | None = None, - env_cls: type | None = None, -) -> dict[str, dict[int, Any]]: - """Create {suite_name: {task_id: VectorEnv}} for MyBenchmark.""" - ... -``` - -See `create_libero_envs()` (multi-suite, multi-task) and `create_metaworld_envs()` (difficulty-grouped tasks) for reference. - -### 2. The config (`src/lerobot/envs/configs.py`) - -Register a config dataclass so users can select your benchmark with `--env.type=`. Each config owns its environment creation and processor logic via two methods: - -- **`create_envs(n_envs, use_async_envs)`** — Returns `{suite: {task_id: VectorEnv}}`. The base class default uses `gym.make()` for single-task envs. Multi-task benchmarks override this. -- **`get_env_processors()`** — Returns `(preprocessor, postprocessor)`. The base class default returns identity (no-op) pipelines. Override if your benchmark needs observation/action transforms. - -```python -@EnvConfig.register_subclass("") -@dataclass -class MyBenchmarkEnvConfig(EnvConfig): - task: str = "" - fps: int = - obs_type: str = "pixels_agent_pos" - - features: dict[str, PolicyFeature] = field(default_factory=lambda: { - ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(,)), - }) - features_map: dict[str, str] = field(default_factory=lambda: { - ACTION: ACTION, - "agent_pos": OBS_STATE, - "pixels": OBS_IMAGE, - }) - - def __post_init__(self): - ... # populate features based on obs_type - - @property - def gym_kwargs(self) -> dict: - return {"obs_type": self.obs_type, "render_mode": self.render_mode} - - def create_envs(self, n_envs: int, use_async_envs: bool = True): - """Override for multi-task benchmarks or custom env creation.""" - from lerobot.envs. import create__envs - return create__envs(task=self.task, n_envs=n_envs, ...) - - def get_env_processors(self): - """Override if your benchmark needs observation/action transforms.""" - from lerobot.processor.pipeline import PolicyProcessorPipeline - from lerobot.processor.env_processor import MyBenchmarkProcessorStep - return ( - PolicyProcessorPipeline(steps=[MyBenchmarkProcessorStep()]), - PolicyProcessorPipeline(steps=[]), - ) -``` - -Key points: - -- The `register_subclass` name is what users pass on the CLI (`--env.type=`). -- `features` tells the policy what the environment produces. -- `features_map` maps raw observation keys to LeRobot convention keys. -- **No changes to `factory.py` needed** — the factory delegates to `cfg.create_envs()` and `cfg.get_env_processors()` automatically. - -### 3. Env processor (optional — `src/lerobot/processor/env_processor.py`) - -Only needed if your benchmark requires observation transforms beyond what `preprocess_observation()` handles (e.g. image flipping, coordinate conversion). Define the processor step here and return it from `get_env_processors()` in your config (see step 2): - -```python -@dataclass -@ProcessorStepRegistry.register(name="_processor") -class MyBenchmarkProcessorStep(ObservationProcessorStep): - def _process_observation(self, observation): - processed = observation.copy() - # your transforms here - return processed - - def transform_features(self, features): - return features # update if shapes change - - def observation(self, observation): - return self._process_observation(observation) -``` - -See `LiberoProcessorStep` for a full example (image rotation, quaternion-to-axis-angle conversion). - -### 4. Dependencies (`pyproject.toml`) - -Add a new optional-dependency group: - -```toml -mybenchmark = ["my-benchmark-pkg==1.2.3", "lerobot[scipy-dep]"] -``` - -Pinning rules: - -- **Always pin** benchmark packages to exact versions for reproducibility (e.g. `metaworld==3.0.0`). -- **Add platform markers** when needed (e.g. `; sys_platform == 'linux'`). -- **Pin fragile transitive deps** if known (e.g. `gymnasium==1.1.0` for Meta-World). -- **Document constraints** in your benchmark doc page. - -Users install with: - -```bash -pip install -e ".[mybenchmark]" -``` - -### 5. Documentation (`docs/source/.mdx`) - -Write a user-facing page following the template in the next section. See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for full examples. - -### 6. Table of contents (`docs/source/_toctree.yml`) - -Add your benchmark to the "Benchmarks" section: - -```yaml -- sections: - - local: libero - title: LIBERO - - local: metaworld - title: Meta-World - - local: envhub_isaaclab_arena - title: NVIDIA IsaacLab Arena Environments - - local: - title: - title: "Benchmarks" -``` - -### 7. CI smoke test (`docker/` + `.github/workflows/benchmark_tests.yml`) - -Each benchmark must have an isolated Docker image and a CI job that runs a 1-episode eval. This catches install-time regressions (broken transitive deps, import errors, interactive prompts) before they reach users. - -**Create `docker/Dockerfile.benchmark.`** — copy an existing one and change only the extra name: - -```dockerfile -# Isolated benchmark image — installs lerobot[] only. -# Build: docker build -f docker/Dockerfile.benchmark. -t lerobot-benchmark- . -ARG CUDA_VERSION=12.4.1 -ARG OS_VERSION=22.04 -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} -ARG PYTHON_VERSION=3.12 -# ... (same system deps as Dockerfile.benchmark.libero) ... -RUN uv sync --locked --extra --no-cache -``` - -Each benchmark gets its own image so its dependency tree (pinned simulator packages, specific mujoco/scipy versions) cannot conflict with other benchmarks. - -**Add a job to `.github/workflows/benchmark_tests.yml`** — copy an existing job block and adjust: - -```yaml --integration-test: - name: — build image + 1-episode eval - runs-on: - group: aws-g6-4xlarge-plus - env: - HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - lfs: true - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] - with: - cache-binary: false - - name: Build image - uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] - with: - context: . - file: docker/Dockerfile.benchmark. - push: false - load: true - tags: lerobot-benchmark-:ci - cache-from: type=local,src=/tmp/.buildx-cache- - cache-to: type=local,dest=/tmp/.buildx-cache-,mode=max - - name: Run smoke eval (1 episode) - run: | - docker run --rm --gpus all \ - --shm-size=4g \ - -e HF_HOME=/tmp/hf \ - -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ - lerobot-benchmark-:ci \ - bash -c " - hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true - lerobot-eval \ - --policy.path= \ - --env.type= \ - --env.task= \ - --eval.batch_size=1 \ - --eval.n_episodes=1 \ - --eval.use_async_envs=false \ - --policy.device=cuda - " -``` - -**Tips:** - -- If the benchmark library prompts for user input on import (like LIBERO asking for a dataset folder), pass the relevant env var in the `docker run` command (e.g. `-e LIBERO_DATA_FOLDER=/tmp/libero_data`). -- The job is scoped to only trigger on changes to `src/lerobot/envs/**`, `src/lerobot/scripts/lerobot_eval.py`, and the Dockerfiles — it won't run on unrelated PRs. - -## Verifying your integration - -After completing the steps above, confirm that everything works: - -1. **Install** — `pip install -e ".[mybenchmark]"` and verify the dependency group installs cleanly. -2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys. -3. **Run a full eval** — `lerobot-eval --env.type= --env.task= --eval.n_episodes=1 --policy.path=` to exercise the full pipeline end-to-end. (`batch_size` defaults to auto-tuning based on CPU cores; pass `--eval.batch_size=1` to force a single environment.) -4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates. -5. **Add CI smoke test** — follow step 7 above to add a Dockerfile and CI job. This ensures the install stays green as dependencies evolve. - -## Writing a benchmark doc page - -Each benchmark `.mdx` page should include: - -- **Title and description** — 1-2 paragraphs on what the benchmark tests and why it matters. -- **Links** — paper, GitHub repo, project website (if available). -- **Overview image or GIF.** -- **Available tasks** — table of task suites with counts and brief descriptions. -- **Installation** — `pip install -e ".[]"` plus any extra steps (env vars, system packages). -- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` for reproducible results. `batch_size` defaults to auto; only specify it if needed. Include single-task and multi-task examples if applicable. See the [Evaluation guide](evaluation) for details. -- **Policy inputs and outputs** — observation keys with shapes, action space description. -- **Recommended evaluation episodes** — how many episodes per task is standard. -- **Training** — example `lerobot-train` command. -- **Reproducing published results** — link to pretrained model, eval command, results table (if available). - -See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for complete examples. diff --git a/docs/source/evaluation.mdx b/docs/source/evaluation.mdx deleted file mode 100644 index ecd0cc1d6..000000000 --- a/docs/source/evaluation.mdx +++ /dev/null @@ -1,162 +0,0 @@ -# Evaluation - -`lerobot-eval` runs a trained policy on a simulation benchmark and reports success rate, reward, and (optionally) episode videos. It handles environment creation, batched rollouts, and metric aggregation automatically. - -## Quick start - -Evaluate a Hub-hosted policy on LIBERO: - -```bash -lerobot-eval \ - --policy.path=pepijn223/smolvla_libero \ - --env.type=libero \ - --env.task=libero_spatial \ - --eval.n_episodes=10 \ - --policy.device=cuda -``` - -Evaluate a local checkpoint: - -```bash -lerobot-eval \ - --policy.path=outputs/train/act_pusht/checkpoints/005000/pretrained_model \ - --env.type=pusht \ - --eval.n_episodes=10 -``` - -`batch_size` defaults to **auto** (based on CPU cores). The script picks the right number of parallel environments for your machine. - -## Key flags - -| Flag | Default | Description | -| ----------------------- | -------------- | ------------------------------------------------------------------------------------- | -| `--policy.path` | required | Hub repo ID or local path to a pretrained model | -| `--env.type` | required | Benchmark name (`pusht`, `libero`, `metaworld`, etc.) | -| `--env.task` | varies | Task or suite name (e.g. `libero_spatial`, `libero_10`) | -| `--eval.n_episodes` | `50` | Total episodes to run (across all tasks) | -| `--eval.batch_size` | `0` (auto) | Number of parallel environments. `0` = auto-tune from CPU cores | -| `--eval.use_async_envs` | `true` | Use `AsyncVectorEnv` (parallel stepping). Auto-downgrades to sync when `batch_size=1` | -| `--policy.device` | `cuda` | Inference device | -| `--policy.use_amp` | `false` | Mixed-precision inference (saves VRAM, faster on Ampere+) | -| `--seed` | `1000` | Random seed for reproducibility | -| `--output_dir` | auto-generated | Where to write results and videos | - -### Environment-specific flags - -Some benchmarks accept additional flags through `--env.*`: - -```bash -# LIBERO: map simulator camera names to policy feature names ---env.camera_name_mapping='{"agentview_image": "camera1", "robot0_eye_in_hand_image": "camera2"}' - -# Fill unused camera slots with zeros ---policy.empty_cameras=1 -``` - -See each benchmark's documentation ([LIBERO](libero), [Meta-World](metaworld)) for benchmark-specific flags. - -## How batch_size works - -`batch_size` controls how many environments run in parallel within a single `VectorEnv`: - -| `batch_size` | Behavior | -| ------------- | -------------------------------------------------------------------- | -| `0` (default) | Auto-tune: `floor(cpu_cores × 0.7)`, capped by `n_episodes` and `64` | -| `1` | Single environment, synchronous. Useful for debugging | -| `N` | N environments step in parallel via `AsyncVectorEnv` | - -When `batch_size > 1` and `use_async_envs=true`, each environment runs in its own subprocess via Gymnasium's `AsyncVectorEnv`. This parallelizes the simulation stepping (the main bottleneck), while the policy runs a single batched forward pass on GPU. - -**Example:** On a 16-core machine with `n_episodes=100`: - -- Auto batch_size = `floor(16 × 0.7)` = `11` -- 11 environments step simultaneously → ~11× faster than sequential - -## Performance - -### AsyncVectorEnv (default) - -`AsyncVectorEnv` spawns one subprocess per environment. Each subprocess has its own simulator instance. While the policy computes actions on GPU, all environments step in parallel on CPU: - -``` -GPU: [inference]....[inference]....[inference].... -CPU: [step × N]....................[step × N]...... - ↑ parallel ↑ parallel -``` - -For GPU-based simulators (LIBERO, Meta-World), the environments use **lazy initialization**: the GPU/EGL context is created inside the worker subprocess on first `reset()`, not in the parent process. This avoids `EGL_BAD_CONTEXT` crashes from inheriting stale GPU handles across `fork()`. - -### Lazy task loading - -For multi-task benchmarks (e.g. LIBERO with 10 tasks), environments are wrapped in `_LazyAsyncVectorEnv` which defers worker creation until the task is actually evaluated. This keeps peak process count = `batch_size` instead of `n_tasks × batch_size`. After each task completes, workers are closed to free resources. - -### Tuning for speed - -| Situation | Recommendation | -| ------------------------------ | ----------------------------------------------------- | -| Slow eval, low GPU utilization | Increase `batch_size` (or leave at auto) | -| Out of memory (system RAM) | Decrease `batch_size` | -| Out of GPU memory | Decrease `batch_size`, or use `--policy.use_amp=true` | -| Debugging / single-stepping | `--eval.batch_size=1 --eval.use_async_envs=false` | - -## Output - -Results are written to `output_dir` (default: `outputs/eval//