From c4d7e7468b4b9d192e385155be2bb5d95ca298d9 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 8 Apr 2026 14:33:48 +0200
Subject: [PATCH] chore: remove out-of-scope benchmark/CI/docs files from PR

Benchmark CI workflow, Dockerfiles, benchmark docs, evaluation smoke-test
doc, and dispatch tests belong in a separate PR. Scope this PR to the
async env init changes only.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/benchmark_tests.yml | 171 -----------
 docker/Dockerfile.benchmark.libero    |  85 ------
 docker/Dockerfile.benchmark.metaworld |  74 -----
 docs/source/_toctree.yml              |   2 -
 docs/source/adding_benchmarks.mdx     | 397 --------------------------
 docs/source/evaluation.mdx            | 162 -----------
 tests/envs/test_dispatch.py           | 143 ----------
 7 files changed, 1034 deletions(-)
 delete mode 100644 .github/workflows/benchmark_tests.yml
 delete mode 100644 docker/Dockerfile.benchmark.libero
 delete mode 100644 docker/Dockerfile.benchmark.metaworld
 delete mode 100644 docs/source/adding_benchmarks.mdx
 delete mode 100644 docs/source/evaluation.mdx
 delete mode 100644 tests/envs/test_dispatch.py
diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
deleted file mode 100644
index 52f73fe46..000000000
--- a/.github/workflows/benchmark_tests.yml
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Integration tests: build an isolated Docker image per benchmark and run a
-# 1-episode smoke eval. Each benchmark gets its own image so incompatible
-# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide.
-#
-# To add a new benchmark:
-#   1. Add docker/Dockerfile.benchmark.<name>  (install only lerobot[<name>])
-#   2. Copy one of the jobs below and adjust the image name and eval command.
-name: Benchmark Integration Tests
-
-on:
-  # Run manually from the Actions tab
-  workflow_dispatch:
-
-  push:
-    branches:
-      - feat/async-vector-env
-    paths:
-      - "src/lerobot/envs/**"
-      - "src/lerobot/scripts/lerobot_eval.py"
-      - "docker/Dockerfile.benchmark.*"
-      - ".github/workflows/benchmark_tests.yml"
-      - "pyproject.toml"
-
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "src/lerobot/envs/**"
-      - "src/lerobot/scripts/lerobot_eval.py"
-      - "docker/Dockerfile.benchmark.*"
-      - ".github/workflows/benchmark_tests.yml"
-      - "pyproject.toml"
-
-permissions:
-  contents: read
-
-env:
-  UV_VERSION: "0.8.0"
-  PYTHON_VERSION: "3.12"
-
-# Cancel in-flight runs for the same branch/PR.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  # ── LIBERO ────────────────────────────────────────────────────────────────
-  # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain)
-  libero-integration-test:
-    name: Libero — build image + 1-episode eval
-    runs-on:
-      group: aws-g6-4xlarge-plus
-    env:
-      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
-
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          lfs: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
-        with:
-          cache-binary: false
-
-      # Build the benchmark-specific image; layer cache lives in the runner's
-      # local Docker daemon — reused across re-runs on the same machine.
-      - name: Build Libero benchmark image
-        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
-        with:
-          context: .
-          file: docker/Dockerfile.benchmark.libero
-          push: false
-          load: true
-          tags: lerobot-benchmark-libero:ci
-          cache-from: type=local,src=/tmp/.buildx-cache-libero
-          cache-to: type=local,dest=/tmp/.buildx-cache-libero,mode=max
-
-      - name: Login to Hugging Face
-        if: env.HF_USER_TOKEN != ''
-        run: |
-          docker run --rm \
-            -e HF_HOME=/tmp/hf \
-            lerobot-benchmark-libero:ci \
-            bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami"
-
-      - name: Run Libero smoke eval (1 episode)
-        run: |
-          docker run --rm --gpus all \
-            --shm-size=4g \
-            -e HF_HOME=/tmp/hf \
-            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-            lerobot-benchmark-libero:ci \
-            bash -c "
-              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
-              lerobot-eval \
-                --policy.path=pepijn223/smolvla_libero \
-                --env.type=libero \
-                --env.task=libero_spatial \
-                --eval.batch_size=1 \
-                --eval.n_episodes=1 \
-                --eval.use_async_envs=false \
-                --policy.device=cuda \
-                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
-                --policy.empty_cameras=1
-            "
-
-  # ── METAWORLD ─────────────────────────────────────────────────────────────
-  # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
-  metaworld-integration-test:
-    name: MetaWorld — build image + 1-episode eval
-    runs-on:
-      group: aws-g6-4xlarge-plus
-    env:
-      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
-
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          lfs: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
-        with:
-          cache-binary: false
-
-      - name: Build MetaWorld benchmark image
-        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
-        with:
-          context: .
-          file: docker/Dockerfile.benchmark.metaworld
-          push: false
-          load: true
-          tags: lerobot-benchmark-metaworld:ci
-          cache-from: type=local,src=/tmp/.buildx-cache-metaworld
-          cache-to: type=local,dest=/tmp/.buildx-cache-metaworld,mode=max
-
-      - name: Run MetaWorld smoke eval (1 episode)
-        run: |
-          docker run --rm --gpus all \
-            --shm-size=4g \
-            -e HF_HOME=/tmp/hf \
-            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-            lerobot-benchmark-metaworld:ci \
-            bash -c "
-              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
-              lerobot-eval \
-                --policy.path=pepijn223/smolvla_metaworld \
-                --env.type=metaworld \
-                --env.task=metaworld-push-v2 \
-                --eval.batch_size=1 \
-                --eval.n_episodes=1 \
-                --eval.use_async_envs=false \
-                --policy.device=cuda
-            "
diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero
deleted file mode 100644
index b3969d491..000000000
--- a/docker/Dockerfile.benchmark.libero
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Isolated benchmark image for LIBERO integration tests.
-# Installs only lerobot[libero] so its dep tree (hf-libero, dm-control, mujoco)
-# cannot conflict with other benchmarks.
-#
-# Build:  docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero .
-# Run:    docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ...
-
-ARG CUDA_VERSION=12.4.1
-ARG OS_VERSION=22.04
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
-
-ARG PYTHON_VERSION=3.12
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    MUJOCO_GL=egl \
-    PATH=/lerobot/.venv/bin:$PATH \
-    CUDA_VISIBLE_DEVICES=0 \
-    DEVICE=cuda
-
-# System deps — same set as Dockerfile.internal
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    software-properties-common build-essential git curl \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
-    libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
-    cmake pkg-config ninja-build \
-    && add-apt-repository -y ppa:deadsnakes/ppa \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
-       python${PYTHON_VERSION} \
-       python${PYTHON_VERSION}-venv \
-       python${PYTHON_VERSION}-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && mv /root/.local/bin/uv /usr/local/bin/uv \
-    && useradd --create-home --shell /bin/bash user_lerobot \
-    && usermod -aG sudo user_lerobot \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /lerobot
-RUN chown -R user_lerobot:user_lerobot /lerobot
-USER user_lerobot
-
-ENV HOME=/home/user_lerobot \
-    HF_HOME=/home/user_lerobot/.cache/huggingface \
-    HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
-    TORCH_HOME=/home/user_lerobot/.cache/torch \
-    TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
-
-RUN uv venv --python python${PYTHON_VERSION}
-
-# Install only lerobot[libero] — completely isolated from metaworld's dep tree
-COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
-COPY --chown=user_lerobot:user_lerobot src/ src/
-
-RUN uv sync --locked --extra libero --extra smolvla --no-cache
-
-# Pre-create libero's config file pointing to the bundled package assets.
-# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing.
-# We use importlib.util.find_spec (does NOT execute libero, so no prompt) to locate
-# the package, then write the config to the correct bundled paths.
-RUN LIBERO_DIR=$(python${PYTHON_VERSION} -c \
-      "import importlib.util, os; s=importlib.util.find_spec('libero'); \
-       print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
-    mkdir -p /home/user_lerobot/.libero && \
-    printf "assets: ${LIBERO_DIR}/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
-    > /home/user_lerobot/.libero/config.yaml
-
-RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
-
-COPY --chown=user_lerobot:user_lerobot . .
-
-CMD ["/bin/bash"]
diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld
deleted file mode 100644
index 0c916c553..000000000
--- a/docker/Dockerfile.benchmark.metaworld
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Isolated benchmark image for MetaWorld integration tests.
-# Installs only lerobot[metaworld] so its dep tree (metaworld==3.0.0, mujoco>=3)
-# cannot conflict with other benchmarks.
-#
-# Build:  docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld .
-# Run:    docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ...
-
-ARG CUDA_VERSION=12.4.1
-ARG OS_VERSION=22.04
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
-
-ARG PYTHON_VERSION=3.12
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    MUJOCO_GL=egl \
-    PATH=/lerobot/.venv/bin:$PATH \
-    CUDA_VISIBLE_DEVICES=0 \
-    DEVICE=cuda
-
-# System deps — same set as Dockerfile.internal
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    software-properties-common build-essential git curl \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
-    libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
-    cmake pkg-config ninja-build \
-    && add-apt-repository -y ppa:deadsnakes/ppa \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
-       python${PYTHON_VERSION} \
-       python${PYTHON_VERSION}-venv \
-       python${PYTHON_VERSION}-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && mv /root/.local/bin/uv /usr/local/bin/uv \
-    && useradd --create-home --shell /bin/bash user_lerobot \
-    && usermod -aG sudo user_lerobot \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /lerobot
-RUN chown -R user_lerobot:user_lerobot /lerobot
-USER user_lerobot
-
-ENV HOME=/home/user_lerobot \
-    HF_HOME=/home/user_lerobot/.cache/huggingface \
-    HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
-    TORCH_HOME=/home/user_lerobot/.cache/torch \
-    TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
-
-RUN uv venv --python python${PYTHON_VERSION}
-
-# Install only lerobot[metaworld] — completely isolated from libero's dep tree
-COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
-COPY --chown=user_lerobot:user_lerobot src/ src/
-
-RUN uv sync --locked --extra metaworld --extra smolvla --no-cache
-
-RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
-
-COPY --chown=user_lerobot:user_lerobot . .
-
-CMD ["/bin/bash"]
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index f69f6d900..3dcba5993 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -73,8 +73,4 @@
     title: Control & Train Robots in Sim (LeIsaac)
   title: "Simulation"
 - sections:
-  - local: evaluation
-    title: Evaluation (lerobot-eval)
-  - local: adding_benchmarks
-    title: Adding a New Benchmark
   - local: libero
diff --git a/docs/source/adding_benchmarks.mdx b/docs/source/adding_benchmarks.mdx
deleted file mode 100644
index 1b1df41b7..000000000
--- a/docs/source/adding_benchmarks.mdx
+++ /dev/null
@@ -1,397 +0,0 @@
-# Adding a New Benchmark
-
-This guide walks you through adding a new simulation benchmark to LeRobot. Follow the steps in order and use the existing benchmarks as templates.
-
-A benchmark in LeRobot is a set of [Gymnasium](https://gymnasium.farama.org/) environments that wrap a third-party simulator (like LIBERO or Meta-World) behind a standard `gym.Env` interface. The `lerobot-eval` CLI then runs evaluation uniformly across all benchmarks.
-
-## Existing benchmarks at a glance
-
-Before diving in, here is what is already integrated:
-
-| Benchmark      | Env file            | Config class       | Tasks               | Action dim   | Processor                    |
-| -------------- | ------------------- | ------------------ | ------------------- | ------------ | ---------------------------- |
-| LIBERO         | `envs/libero.py`    | `LiberoEnv`        | 130 across 5 suites | 7            | `LiberoProcessorStep`        |
-| Meta-World     | `envs/metaworld.py` | `MetaworldEnv`     | 50 (MT50)           | 4            | None                         |
-| IsaacLab Arena | Hub-hosted          | `IsaaclabArenaEnv` | Configurable        | Configurable | `IsaaclabArenaProcessorStep` |
-
-Use `src/lerobot/envs/libero.py` and `src/lerobot/envs/metaworld.py` as reference implementations.
-
-## How it all fits together
-
-### Data flow
-
-During evaluation, data moves through four stages:
-
-```
-1. gym.Env  ──→  raw observations (numpy dicts)
-
-2. Preprocessing  ──→  standard LeRobot keys + task description
-   (preprocess_observation in envs/utils.py, env.call("task_description"))
-
-3. Processors  ──→  env-specific then policy-specific transforms
-   (env_preprocessor, policy_preprocessor)
-
-4. Policy  ──→  select_action()  ──→  action tensor
-   then reverse: policy_postprocessor → env_postprocessor → numpy action → env.step()
-```
-
-Most benchmarks only need to care about stage 1 (producing observations in the right format) and optionally stage 3 (if env-specific transforms are needed).
-
-### Environment structure
-
-`make_env()` returns a nested dict of vectorized environments:
-
-```python
-dict[str, dict[int, gym.vector.VectorEnv]]
-#    ^suite       ^task_id
-```
-
-A single-task env (e.g. PushT) looks like `{"pusht": {0: vec_env}}`.
-A multi-task benchmark (e.g. LIBERO) looks like `{"libero_spatial": {0: vec0, 1: vec1, ...}, ...}`.
-
-### How evaluation runs
-
-All benchmarks are evaluated the same way by `lerobot-eval`:
-
-1. `make_env()` builds the nested `{suite: {task_id: VectorEnv}}` dict.
-2. `eval_policy_all()` iterates over every suite and task.
-3. For each task, it runs `n_episodes` rollouts via `rollout()`.
-4. Results are aggregated hierarchically: episode, task, suite, overall.
-5. Metrics include `pc_success` (success rate), `avg_sum_reward`, and `avg_max_reward`.
-
-The critical piece: your env must return `info["is_success"]` on every `step()` call. This is how the eval loop knows whether a task was completed.
-
-## What your environment must provide
-
-LeRobot does not enforce a strict observation schema. Instead it relies on a set of conventions that all benchmarks follow.
-
-### Env attributes
-
-Your `gym.Env` must set these attributes:
-
-| Attribute            | Type  | Why                                                  |
-| -------------------- | ----- | ---------------------------------------------------- |
-| `_max_episode_steps` | `int` | `rollout()` uses this to cap episode length          |
-| `task_description`   | `str` | Passed to VLA policies as a language instruction     |
-| `task`               | `str` | Fallback identifier if `task_description` is not set |
-
-### Success reporting
-
-Your `step()` and `reset()` must include `"is_success"` in the `info` dict:
-
-```python
-info = {"is_success": True}   # or False
-return observation, reward, terminated, truncated, info
-```
-
-### Observations
-
-The simplest approach is to map your simulator's outputs to the standard keys that `preprocess_observation()` already understands. Do this inside your `gym.Env` (e.g. in a `_format_raw_obs()` helper):
-
-| Your env should output    | LeRobot maps it to         | What it is                            |
-| ------------------------- | -------------------------- | ------------------------------------- |
-| `"pixels"` (single array) | `observation.image`        | Single camera image, HWC uint8        |
-| `"pixels"` (dict)         | `observation.images.<cam>` | Multiple cameras, each HWC uint8      |
-| `"agent_pos"`             | `observation.state`        | Proprioceptive state vector           |
-| `"environment_state"`     | `observation.env_state`    | Full environment state (e.g. PushT)   |
-| `"robot_state"`           | `observation.robot_state`  | Nested robot state dict (e.g. LIBERO) |
-
-If your simulator uses different key names, you have two options:
-
-1. **Recommended:** Rename them to the standard keys inside your `gym.Env` wrapper.
-2. **Alternative:** Write an env processor to transform observations after `preprocess_observation()` runs (see step 4 below).
-
-### Actions
-
-Actions are continuous numpy arrays in a `gym.spaces.Box`. The dimensionality depends on your benchmark (7 for LIBERO, 4 for Meta-World, etc.). Policies adapt to different action dimensions through their `input_features` / `output_features` config.
-
-### Feature declaration
-
-Each `EnvConfig` subclass declares two dicts that tell the policy what to expect:
-
-- `features` — maps feature names to `PolicyFeature(type, shape)` (e.g. action dim, image shape).
-- `features_map` — maps raw observation keys to LeRobot convention keys (e.g. `"agent_pos"` to `"observation.state"`).
-
-## Step by step
-
-<Tip>
-  At minimum, you need two files: a **gym.Env wrapper** and an **EnvConfig
-  subclass** with a `create_envs()` override. Everything else is optional or
-  documentation. No changes to `factory.py` are needed.
-</Tip>
-
-### Checklist
-
-| File                                      | Required | Why                                                          |
-| ----------------------------------------- | -------- | ------------------------------------------------------------ |
-| `src/lerobot/envs/<benchmark>.py`         | Yes      | Wraps the simulator as a standard gym.Env                    |
-| `src/lerobot/envs/configs.py`             | Yes      | Registers your benchmark and its `create_envs()` for the CLI |
-| `src/lerobot/processor/env_processor.py`  | Optional | Custom observation/action transforms                         |
-| `src/lerobot/envs/utils.py`               | Optional | Only if you need new raw observation keys                    |
-| `pyproject.toml`                          | Yes      | Declares benchmark-specific dependencies                     |
-| `docs/source/<benchmark>.mdx`             | Yes      | User-facing documentation page                               |
-| `docs/source/_toctree.yml`                | Yes      | Adds your page to the docs sidebar                           |
-| `docker/Dockerfile.benchmark.<benchmark>` | Yes      | Isolated Docker image for CI smoke tests                     |
-| `.github/workflows/benchmark_tests.yml`   | Yes      | CI job that builds the image and runs a 1-episode smoke eval |
-
-### 1. The gym.Env wrapper (`src/lerobot/envs/<benchmark>.py`)
-
-Create a `gym.Env` subclass that wraps the third-party simulator:
-
-```python
-class MyBenchmarkEnv(gym.Env):
-    metadata = {"render_modes": ["rgb_array"], "render_fps": <fps>}
-
-    def __init__(self, task_suite, task_id, ...):
-        super().__init__()
-        self.task = <task_name_string>
-        self.task_description = <natural_language_instruction>
-        self._max_episode_steps = <max_steps>
-        self.observation_space = spaces.Dict({...})
-        self.action_space = spaces.Box(low=..., high=..., shape=(...,), dtype=np.float32)
-
-    def reset(self, seed=None, **kwargs):
-        ...  # return (observation, info) — info must contain {"is_success": False}
-
-    def step(self, action: np.ndarray):
-        ...  # return (obs, reward, terminated, truncated, info) — info must contain {"is_success": <bool>}
-
-    def render(self):
-        ...  # return RGB image as numpy array
-
-    def close(self):
-        ...
-```
-
-**GPU-based simulators (e.g. MuJoCo with EGL rendering):** If your simulator allocates GPU/EGL contexts during `__init__`, defer that allocation to a `_ensure_env()` helper called on first `reset()`/`step()`. This avoids inheriting stale GPU handles when `AsyncVectorEnv` spawns worker processes. See `LiberoEnv._ensure_env()` for the pattern.
-
-Also provide a factory function that returns the nested dict structure:
-
-```python
-def create_mybenchmark_envs(
-    task: str,
-    n_envs: int,
-    gym_kwargs: dict | None = None,
-    env_cls: type | None = None,
-) -> dict[str, dict[int, Any]]:
-    """Create {suite_name: {task_id: VectorEnv}} for MyBenchmark."""
-    ...
-```
-
-See `create_libero_envs()` (multi-suite, multi-task) and `create_metaworld_envs()` (difficulty-grouped tasks) for reference.
-
-### 2. The config (`src/lerobot/envs/configs.py`)
-
-Register a config dataclass so users can select your benchmark with `--env.type=<name>`. Each config owns its environment creation and processor logic via two methods:
-
-- **`create_envs(n_envs, use_async_envs)`** — Returns `{suite: {task_id: VectorEnv}}`. The base class default uses `gym.make()` for single-task envs. Multi-task benchmarks override this.
-- **`get_env_processors()`** — Returns `(preprocessor, postprocessor)`. The base class default returns identity (no-op) pipelines. Override if your benchmark needs observation/action transforms.
-
-```python
-@EnvConfig.register_subclass("<benchmark_name>")
-@dataclass
-class MyBenchmarkEnvConfig(EnvConfig):
-    task: str = "<default_task>"
-    fps: int = <fps>
-    obs_type: str = "pixels_agent_pos"
-
-    features: dict[str, PolicyFeature] = field(default_factory=lambda: {
-        ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(<action_dim>,)),
-    })
-    features_map: dict[str, str] = field(default_factory=lambda: {
-        ACTION: ACTION,
-        "agent_pos": OBS_STATE,
-        "pixels": OBS_IMAGE,
-    })
-
-    def __post_init__(self):
-        ...  # populate features based on obs_type
-
-    @property
-    def gym_kwargs(self) -> dict:
-        return {"obs_type": self.obs_type, "render_mode": self.render_mode}
-
-    def create_envs(self, n_envs: int, use_async_envs: bool = True):
-        """Override for multi-task benchmarks or custom env creation."""
-        from lerobot.envs.<benchmark> import create_<benchmark>_envs
-        return create_<benchmark>_envs(task=self.task, n_envs=n_envs, ...)
-
-    def get_env_processors(self):
-        """Override if your benchmark needs observation/action transforms."""
-        from lerobot.processor.pipeline import PolicyProcessorPipeline
-        from lerobot.processor.env_processor import MyBenchmarkProcessorStep
-        return (
-            PolicyProcessorPipeline(steps=[MyBenchmarkProcessorStep()]),
-            PolicyProcessorPipeline(steps=[]),
-        )
-```
-
-Key points:
-
-- The `register_subclass` name is what users pass on the CLI (`--env.type=<name>`).
-- `features` tells the policy what the environment produces.
-- `features_map` maps raw observation keys to LeRobot convention keys.
-- **No changes to `factory.py` needed** — the factory delegates to `cfg.create_envs()` and `cfg.get_env_processors()` automatically.
-
-### 3. Env processor (optional — `src/lerobot/processor/env_processor.py`)
-
-Only needed if your benchmark requires observation transforms beyond what `preprocess_observation()` handles (e.g. image flipping, coordinate conversion). Define the processor step here and return it from `get_env_processors()` in your config (see step 2):
-
-```python
-@dataclass
-@ProcessorStepRegistry.register(name="<benchmark>_processor")
-class MyBenchmarkProcessorStep(ObservationProcessorStep):
-    def _process_observation(self, observation):
-        processed = observation.copy()
-        # your transforms here
-        return processed
-
-    def transform_features(self, features):
-        return features  # update if shapes change
-
-    def observation(self, observation):
-        return self._process_observation(observation)
-```
-
-See `LiberoProcessorStep` for a full example (image rotation, quaternion-to-axis-angle conversion).
-
-### 4. Dependencies (`pyproject.toml`)
-
-Add a new optional-dependency group:
-
-```toml
-mybenchmark = ["my-benchmark-pkg==1.2.3", "lerobot[scipy-dep]"]
-```
-
-Pinning rules:
-
-- **Always pin** benchmark packages to exact versions for reproducibility (e.g. `metaworld==3.0.0`).
-- **Add platform markers** when needed (e.g. `; sys_platform == 'linux'`).
-- **Pin fragile transitive deps** if known (e.g. `gymnasium==1.1.0` for Meta-World).
-- **Document constraints** in your benchmark doc page.
-
-Users install with:
-
-```bash
-pip install -e ".[mybenchmark]"
-```
-
-### 5. Documentation (`docs/source/<benchmark>.mdx`)
-
-Write a user-facing page following the template in the next section. See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for full examples.
-
-### 6. Table of contents (`docs/source/_toctree.yml`)
-
-Add your benchmark to the "Benchmarks" section:
-
-```yaml
-- sections:
-    - local: libero
-      title: LIBERO
-    - local: metaworld
-      title: Meta-World
-    - local: envhub_isaaclab_arena
-      title: NVIDIA IsaacLab Arena Environments
-    - local: <your_benchmark>
-      title: <Your Benchmark Name>
-  title: "Benchmarks"
-```
-
-### 7. CI smoke test (`docker/` + `.github/workflows/benchmark_tests.yml`)
-
-Each benchmark must have an isolated Docker image and a CI job that runs a 1-episode eval. This catches install-time regressions (broken transitive deps, import errors, interactive prompts) before they reach users.
-
-**Create `docker/Dockerfile.benchmark.<benchmark>`** — copy an existing one and change only the extra name:
-
-```dockerfile
-# Isolated benchmark image — installs lerobot[<benchmark>] only.
-# Build: docker build -f docker/Dockerfile.benchmark.<benchmark> -t lerobot-benchmark-<benchmark> .
-ARG CUDA_VERSION=12.4.1
-ARG OS_VERSION=22.04
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
-ARG PYTHON_VERSION=3.12
-# ... (same system deps as Dockerfile.benchmark.libero) ...
-RUN uv sync --locked --extra <benchmark> --no-cache
-```
-
-Each benchmark gets its own image so its dependency tree (pinned simulator packages, specific mujoco/scipy versions) cannot conflict with other benchmarks.
-
-**Add a job to `.github/workflows/benchmark_tests.yml`** — copy an existing job block and adjust:
-
-```yaml
-<benchmark>-integration-test:
-  name: <Benchmark> — build image + 1-episode eval
-  runs-on:
-    group: aws-g6-4xlarge-plus
-  env:
-    HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
-  steps:
-    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      with:
-        persist-credentials: false
-        lfs: true
-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
-      with:
-        cache-binary: false
-    - name: Build <Benchmark> image
-      uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
-      with:
-        context: .
-        file: docker/Dockerfile.benchmark.<benchmark>
-        push: false
-        load: true
-        tags: lerobot-benchmark-<benchmark>:ci
-        cache-from: type=local,src=/tmp/.buildx-cache-<benchmark>
-        cache-to: type=local,dest=/tmp/.buildx-cache-<benchmark>,mode=max
-    - name: Run <Benchmark> smoke eval (1 episode)
-      run: |
-        docker run --rm --gpus all \
-          --shm-size=4g \
-          -e HF_HOME=/tmp/hf \
-          -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-          lerobot-benchmark-<benchmark>:ci \
-          bash -c "
-            hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
-            lerobot-eval \
-              --policy.path=<hub_policy_path> \
-              --env.type=<benchmark> \
-              --env.task=<task> \
-              --eval.batch_size=1 \
-              --eval.n_episodes=1 \
-              --eval.use_async_envs=false \
-              --policy.device=cuda
-          "
-```
-
-**Tips:**
-
-- If the benchmark library prompts for user input on import (like LIBERO asking for a dataset folder), pass the relevant env var in the `docker run` command (e.g. `-e LIBERO_DATA_FOLDER=/tmp/libero_data`).
-- The job is scoped to only trigger on changes to `src/lerobot/envs/**`, `src/lerobot/scripts/lerobot_eval.py`, and the Dockerfiles — it won't run on unrelated PRs.
-
-## Verifying your integration
-
-After completing the steps above, confirm that everything works:
-
-1. **Install** — `pip install -e ".[mybenchmark]"` and verify the dependency group installs cleanly.
-2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys.
-3. **Run a full eval** — `lerobot-eval --env.type=<name> --env.task=<task> --eval.n_episodes=1 --policy.path=<any_compatible_policy>` to exercise the full pipeline end-to-end. (`batch_size` defaults to auto-tuning based on CPU cores; pass `--eval.batch_size=1` to force a single environment.)
-4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates.
-5. **Add CI smoke test** — follow step 7 above to add a Dockerfile and CI job. This ensures the install stays green as dependencies evolve.
-
-## Writing a benchmark doc page
-
-Each benchmark `.mdx` page should include:
-
-- **Title and description** — 1-2 paragraphs on what the benchmark tests and why it matters.
-- **Links** — paper, GitHub repo, project website (if available).
-- **Overview image or GIF.**
-- **Available tasks** — table of task suites with counts and brief descriptions.
-- **Installation** — `pip install -e ".[<benchmark>]"` plus any extra steps (env vars, system packages).
-- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` for reproducible results. `batch_size` defaults to auto; only specify it if needed. Include single-task and multi-task examples if applicable. See the [Evaluation guide](evaluation) for details.
-- **Policy inputs and outputs** — observation keys with shapes, action space description.
-- **Recommended evaluation episodes** — how many episodes per task is standard.
-- **Training** — example `lerobot-train` command.
-- **Reproducing published results** — link to pretrained model, eval command, results table (if available).
-
-See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for complete examples.
diff --git a/docs/source/evaluation.mdx b/docs/source/evaluation.mdx
deleted file mode 100644
index ecd0cc1d6..000000000
--- a/docs/source/evaluation.mdx
+++ /dev/null
@@ -1,162 +0,0 @@
-# Evaluation
-
-`lerobot-eval` runs a trained policy on a simulation benchmark and reports success rate, reward, and (optionally) episode videos. It handles environment creation, batched rollouts, and metric aggregation automatically.
-
-## Quick start
-
-Evaluate a Hub-hosted policy on LIBERO:
-
-```bash
-lerobot-eval \
-    --policy.path=pepijn223/smolvla_libero \
-    --env.type=libero \
-    --env.task=libero_spatial \
-    --eval.n_episodes=10 \
-    --policy.device=cuda
-```
-
-Evaluate a local checkpoint:
-
-```bash
-lerobot-eval \
-    --policy.path=outputs/train/act_pusht/checkpoints/005000/pretrained_model \
-    --env.type=pusht \
-    --eval.n_episodes=10
-```
-
-`batch_size` defaults to **auto** (based on CPU cores). The script picks the right number of parallel environments for your machine.
-
-## Key flags
-
-| Flag                    | Default        | Description                                                                           |
-| ----------------------- | -------------- | ------------------------------------------------------------------------------------- |
-| `--policy.path`         | required       | Hub repo ID or local path to a pretrained model                                       |
-| `--env.type`            | required       | Benchmark name (`pusht`, `libero`, `metaworld`, etc.)                                 |
-| `--env.task`            | varies         | Task or suite name (e.g. `libero_spatial`, `libero_10`)                               |
-| `--eval.n_episodes`     | `50`           | Total episodes to run (across all tasks)                                              |
-| `--eval.batch_size`     | `0` (auto)     | Number of parallel environments. `0` = auto-tune from CPU cores                       |
-| `--eval.use_async_envs` | `true`         | Use `AsyncVectorEnv` (parallel stepping). Auto-downgrades to sync when `batch_size=1` |
-| `--policy.device`       | `cuda`         | Inference device                                                                      |
-| `--policy.use_amp`      | `false`        | Mixed-precision inference (saves VRAM, faster on Ampere+)                             |
-| `--seed`                | `1000`         | Random seed for reproducibility                                                       |
-| `--output_dir`          | auto-generated | Where to write results and videos                                                     |
-
-### Environment-specific flags
-
-Some benchmarks accept additional flags through `--env.*`:
-
-```bash
-# LIBERO: map simulator camera names to policy feature names
---env.camera_name_mapping='{"agentview_image": "camera1", "robot0_eye_in_hand_image": "camera2"}'
-
-# Fill unused camera slots with zeros
---policy.empty_cameras=1
-```
-
-See each benchmark's documentation ([LIBERO](libero), [Meta-World](metaworld)) for benchmark-specific flags.
-
-## How batch_size works
-
-`batch_size` controls how many environments run in parallel within a single `VectorEnv`:
-
-| `batch_size`  | Behavior                                                             |
-| ------------- | -------------------------------------------------------------------- |
-| `0` (default) | Auto-tune: `floor(cpu_cores × 0.7)`, capped by `n_episodes` and `64` |
-| `1`           | Single environment, synchronous. Useful for debugging                |
-| `N`           | N environments step in parallel via `AsyncVectorEnv`                 |
-
-When `batch_size > 1` and `use_async_envs=true`, each environment runs in its own subprocess via Gymnasium's `AsyncVectorEnv`. This parallelizes the simulation stepping (the main bottleneck), while the policy runs a single batched forward pass on GPU.
-
-**Example:** On a 16-core machine with `n_episodes=100`:
-
-- Auto batch_size = `floor(16 × 0.7)` = `11`
-- 11 environments step simultaneously → ~11× faster than sequential
-
-## Performance
-
-### AsyncVectorEnv (default)
-
-`AsyncVectorEnv` spawns one subprocess per environment. Each subprocess has its own simulator instance. While the policy computes actions on GPU, all environments step in parallel on CPU:
-
-```
-GPU:  [inference]....[inference]....[inference]....
-CPU:  [step × N]....................[step × N]......
-      ↑ parallel                   ↑ parallel
-```
-
-For GPU-based simulators (LIBERO, Meta-World), the environments use **lazy initialization**: the GPU/EGL context is created inside the worker subprocess on first `reset()`, not in the parent process. This avoids `EGL_BAD_CONTEXT` crashes from inheriting stale GPU handles across `fork()`.
-
-### Lazy task loading
-
-For multi-task benchmarks (e.g. LIBERO with 10 tasks), environments are wrapped in `_LazyAsyncVectorEnv` which defers worker creation until the task is actually evaluated. This keeps peak process count = `batch_size` instead of `n_tasks × batch_size`. After each task completes, workers are closed to free resources.
-
-### Tuning for speed
-
-| Situation                      | Recommendation                                        |
-| ------------------------------ | ----------------------------------------------------- |
-| Slow eval, low GPU utilization | Increase `batch_size` (or leave at auto)              |
-| Out of memory (system RAM)     | Decrease `batch_size`                                 |
-| Out of GPU memory              | Decrease `batch_size`, or use `--policy.use_amp=true` |
-| Debugging / single-stepping    | `--eval.batch_size=1 --eval.use_async_envs=false`     |
-
-## Output
-
-Results are written to `output_dir` (default: `outputs/eval/<date>/<time>_<job_name>/`):
-
-- `eval_info.json` — full metrics: per-episode, per-task, per-group, and overall aggregates
-- `videos/` — episode recordings (when `--eval.n_episodes_to_render > 0`)
-
-### Metrics
-
-| Metric           | Description                                                          |
-| ---------------- | -------------------------------------------------------------------- |
-| `pc_success`     | Success rate (%). Based on `info["is_success"]` from the environment |
-| `avg_sum_reward` | Mean cumulative reward per episode                                   |
-| `avg_max_reward` | Mean peak reward per episode                                         |
-| `n_episodes`     | Total episodes evaluated                                             |
-| `eval_s`         | Total wall-clock time                                                |
-| `eval_ep_s`      | Mean wall-clock time per episode                                     |
-
-## Multi-task evaluation
-
-For benchmarks with multiple tasks (LIBERO suites, Meta-World MT50), `lerobot-eval` automatically:
-
-1. Creates environments for all tasks in the selected suite(s)
-2. Evaluates each task sequentially (one task's workers at a time)
-3. Aggregates metrics per-task, per-group (suite), and overall
-
-```bash
-# Evaluate all 10 tasks in libero_spatial
-lerobot-eval \
-    --policy.path=pepijn223/smolvla_libero \
-    --env.type=libero \
-    --env.task=libero_spatial \
-    --eval.n_episodes=10
-
-# Evaluate multiple suites
-lerobot-eval \
-    --policy.path=pepijn223/smolvla_libero \
-    --env.type=libero \
-    --env.task="libero_spatial,libero_object" \
-    --eval.n_episodes=10
-```
-
-## API usage
-
-You can call the eval functions directly from Python:
-
-```python
-from lerobot.envs.factory import make_env
-from lerobot.policies.factory import make_policy
-from lerobot.scripts.lerobot_eval import eval_policy
-
-envs = make_env(env_cfg, n_envs=10)
-policy = make_policy(cfg=policy_cfg, env_cfg=env_cfg)
-
-metrics = eval_policy(
-    env=envs["libero_spatial"][0],
-    policy=policy,
-    n_episodes=10,
-)
-print(metrics["pc_success"])
-```
diff --git a/tests/envs/test_dispatch.py b/tests/envs/test_dispatch.py
deleted file mode 100644
index 5bd2827f3..000000000
--- a/tests/envs/test_dispatch.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""Tests for the benchmark dispatch refactor (create_envs / get_env_processors on EnvConfig)."""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-
-import gymnasium as gym
-import pytest
-from gymnasium.envs.registration import register, registry as gym_registry
-
-from lerobot.configs.types import PolicyFeature
-from lerobot.envs.configs import EnvConfig
-from lerobot.envs.factory import make_env, make_env_config, make_env_pre_post_processors
-
-logger = logging.getLogger(__name__)
-
-
-def test_registry_all_types():
-    """make_env_config should resolve every registered EnvConfig subclass via the registry."""
-    known = list(EnvConfig.get_known_choices().keys())
-    assert len(known) >= 6
-    for t in known:
-        cfg = make_env_config(t)
-        if not isinstance(cfg, EnvConfig):
-            continue
-        assert cfg.type == t
-
-
-def test_unknown_type():
-    with pytest.raises(ValueError, match="not registered"):
-        make_env_config("nonexistent")
-
-
-def test_identity_processors():
-    """Base class get_env_processors() returns identity pipelines."""
-    cfg = make_env_config("aloha")
-    pre, post = cfg.get_env_processors()
-    assert len(pre.steps) == 0 and len(post.steps) == 0
-
-
-def test_delegation():
-    """make_env() should call cfg.create_envs(), not use if/elif dispatch."""
-    sentinel = {"delegated": {0: "marker"}}
-    fake = type(
-        "Fake",
-        (),
-        {
-            "hub_path": None,
-            "create_envs": lambda self, n_envs, use_async_envs=False: sentinel,
-        },
-    )()
-    result = make_env(fake, n_envs=1)
-    assert result is sentinel
-
-
-def test_processors_delegation():
-    """make_env_pre_post_processors delegates to cfg.get_env_processors()."""
-    cfg = make_env_config("aloha")
-    pre, post = make_env_pre_post_processors(cfg, policy_cfg=None)
-    assert len(pre.steps) == 0
-
-
-def test_base_create_envs():
-    """Base class create_envs() should build a single-task VectorEnv via gym.make()."""
-    gym_id = "_dispatch_test/CartPole-v99"
-    if gym_id not in gym_registry:
-        register(id=gym_id, entry_point="gymnasium.envs.classic_control:CartPoleEnv")
-
-    @EnvConfig.register_subclass("_dispatch_base_test")
-    @dataclass
-    class _Env(EnvConfig):
-        task: str = "CartPole-v99"
-        fps: int = 10
-        features: dict[str, PolicyFeature] = field(default_factory=dict)
-
-        @property
-        def package_name(self):
-            return "_dispatch_test"
-
-        @property
-        def gym_id(self):
-            return gym_id
-
-        @property
-        def gym_kwargs(self):
-            return {}
-
-    try:
-        envs = _Env().create_envs(n_envs=2)
-        assert "_dispatch_base_test" in envs
-        env = envs["_dispatch_base_test"][0]
-        assert isinstance(env, gym.vector.VectorEnv)
-        assert env.num_envs == 2
-        env.close()
-    finally:
-        if gym_id in gym_registry:
-            del gym_registry[gym_id]
-
-
-def test_custom_create_envs_override():
-    """A custom EnvConfig subclass can override create_envs()."""
-    mock_vec = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])
-
-    @EnvConfig.register_subclass("_dispatch_custom_test")
-    @dataclass
-    class _Env(EnvConfig):
-        task: str = "x"
-        features: dict[str, PolicyFeature] = field(default_factory=dict)
-
-        @property
-        def gym_kwargs(self):
-            return {}
-
-        def create_envs(self, n_envs, use_async_envs=False):
-            return {"custom_suite": {0: mock_vec}}
-
-    try:
-        result = make_env(_Env(), n_envs=1)
-        assert "custom_suite" in result
-    finally:
-        mock_vec.close()
-
-
-def test_custom_get_env_processors_override():
-    """A custom EnvConfig subclass can override get_env_processors()."""
-    from lerobot.processor.pipeline import DataProcessorPipeline
-
-    @EnvConfig.register_subclass("_dispatch_proc_test")
-    @dataclass
-    class _Env(EnvConfig):
-        task: str = "x"
-        features: dict[str, PolicyFeature] = field(default_factory=dict)
-
-        @property
-        def gym_kwargs(self):
-            return {}
-
-        def get_env_processors(self):
-            return DataProcessorPipeline(steps=[]), DataProcessorPipeline(steps=[])
-
-    pre, post = _Env().get_env_processors()
-    assert isinstance(pre, DataProcessorPipeline)