mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-14 08:09:45 +00:00
Compare commits
68 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 129537068a | |||
| 1205bb086d | |||
| 501b916601 | |||
| 82034805d6 | |||
| 728fbbd98c | |||
| 9b8630e9d9 | |||
| 5771e2d3ab | |||
| a82fa8b35e | |||
| bcfcedd72e | |||
| 3d4245da7d | |||
| 452d9abaa4 | |||
| 13ee7009fe | |||
| 8bf77ef6b9 | |||
| 4131f22ea1 | |||
| 225bec6552 | |||
| a4d9bee6e2 | |||
| 437014926f | |||
| f4ad290067 | |||
| bd6e27f9a1 | |||
| c4d7e7468b | |||
| f5206a3cd8 | |||
| 66d2382191 | |||
| 786ee5606e | |||
| a4b76c22fd | |||
| 76129ab130 | |||
| 97972ae1de | |||
| 9b131f40b8 | |||
| f4e60371ea | |||
| cd6e6ab765 | |||
| 9be5e4f3bf | |||
| 28c5fd0421 | |||
| 56138e2368 | |||
| 1bb62aa0c5 | |||
| 834532f1dc | |||
| 40757b3481 | |||
| 0bc68740f4 | |||
| 861a7c7068 | |||
| 882b44f6be | |||
| 5ce727f20f | |||
| 634aa89558 | |||
| ec759e994d | |||
| ce6c0ba1b7 | |||
| 99f5659624 | |||
| 438c1be1ca | |||
| 6b3d25bc79 | |||
| 8c3babc2cb | |||
| fa6d7d23d3 | |||
| e05cf3c742 | |||
| 3a6600f7b0 | |||
| f736a36049 | |||
| 4a8c7f3354 | |||
| 91bf889837 | |||
| da50391a23 | |||
| 0ada7f94d8 | |||
| 31b686135e | |||
| d9edc12e00 | |||
| fd2bad9b42 | |||
| 7e729e33c9 | |||
| e383207a15 | |||
| 8ed658c6aa | |||
| 0045f88355 | |||
| 89ce91f69f | |||
| 90e614f6b9 | |||
| ff4f860e5d | |||
| 6f2823bfc4 | |||
| 77415559b8 | |||
| 24d9b74d81 | |||
| 508358749a |
@@ -0,0 +1,309 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Integration tests: build an isolated Docker image per benchmark and run a
|
||||||
|
# 1-episode smoke eval. Each benchmark gets its own image so incompatible
|
||||||
|
# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide.
|
||||||
|
#
|
||||||
|
# To add a new benchmark:
|
||||||
|
# 1. Add docker/Dockerfile.benchmark.<name> (install only lerobot[<name>])
|
||||||
|
# 2. Copy one of the jobs below and adjust the image name and eval command.
|
||||||
|
name: Benchmark Integration Tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
# Run manually from the Actions tab
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
# Run every Monday at 02:00 UTC.
|
||||||
|
schedule:
|
||||||
|
- cron: "0 2 * * 1"
|
||||||
|
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- feat/benchmark-ci
|
||||||
|
- main
|
||||||
|
paths:
|
||||||
|
- "src/lerobot/envs/**"
|
||||||
|
- "src/lerobot/scripts/lerobot_eval.py"
|
||||||
|
- "docker/Dockerfile.benchmark.*"
|
||||||
|
- ".github/workflows/benchmark_tests.yml"
|
||||||
|
- "pyproject.toml"
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
paths:
|
||||||
|
- "src/lerobot/envs/**"
|
||||||
|
- "src/lerobot/scripts/lerobot_eval.py"
|
||||||
|
- "docker/Dockerfile.benchmark.*"
|
||||||
|
- ".github/workflows/benchmark_tests.yml"
|
||||||
|
- "pyproject.toml"
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
env:
|
||||||
|
UV_VERSION: "0.8.0"
|
||||||
|
PYTHON_VERSION: "3.12"
|
||||||
|
|
||||||
|
# Cancel in-flight runs for the same branch/PR.
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# ── LIBERO ────────────────────────────────────────────────────────────────
|
||||||
|
# Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain)
|
||||||
|
libero-integration-test:
|
||||||
|
name: Libero — build image + 1-episode eval
|
||||||
|
runs-on:
|
||||||
|
group: aws-g6-4xlarge-plus
|
||||||
|
env:
|
||||||
|
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
persist-credentials: false
|
||||||
|
lfs: true
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
cache-binary: false
|
||||||
|
|
||||||
|
# Build the benchmark-specific image; layer cache lives in the runner's
|
||||||
|
# local Docker daemon — reused across re-runs on the same machine.
|
||||||
|
- name: Build Libero benchmark image
|
||||||
|
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: docker/Dockerfile.benchmark.libero
|
||||||
|
push: false
|
||||||
|
load: true
|
||||||
|
tags: lerobot-benchmark-libero:ci
|
||||||
|
cache-from: type=local,src=/tmp/.buildx-cache-libero
|
||||||
|
cache-to: type=local,dest=/tmp/.buildx-cache-libero,mode=max
|
||||||
|
|
||||||
|
- name: Login to Hugging Face
|
||||||
|
if: env.HF_USER_TOKEN != ''
|
||||||
|
run: |
|
||||||
|
docker run --rm \
|
||||||
|
-e HF_HOME=/tmp/hf \
|
||||||
|
lerobot-benchmark-libero:ci \
|
||||||
|
bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami"
|
||||||
|
|
||||||
|
- name: Run Libero smoke eval (1 episode)
|
||||||
|
run: |
|
||||||
|
# Named container (no --rm) so we can docker cp artifacts out.
|
||||||
|
# Output to /tmp inside the container — user_lerobot cannot create
|
||||||
|
# root-level dirs like /artifacts.
|
||||||
|
docker run --name libero-eval --gpus all \
|
||||||
|
--shm-size=4g \
|
||||||
|
-e HF_HOME=/tmp/hf \
|
||||||
|
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||||
|
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||||
|
lerobot-benchmark-libero:ci \
|
||||||
|
bash -c "
|
||||||
|
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=pepijn223/smolvla_libero \
|
||||||
|
--env.type=libero \
|
||||||
|
--env.task=libero_spatial \
|
||||||
|
--eval.batch_size=1 \
|
||||||
|
--eval.n_episodes=1 \
|
||||||
|
--eval.use_async_envs=false \
|
||||||
|
--policy.device=cuda \
|
||||||
|
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
|
||||||
|
--policy.empty_cameras=1 \
|
||||||
|
--output_dir=/tmp/eval-artifacts
|
||||||
|
python3 /lerobot/scripts/ci/extract_task_descriptions.py \
|
||||||
|
--env libero --task libero_spatial \
|
||||||
|
--output /tmp/eval-artifacts/task_descriptions.json 2>/dev/null || true
|
||||||
|
"
|
||||||
|
|
||||||
|
- name: Copy Libero artifacts from container
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/libero-artifacts
|
||||||
|
docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
|
||||||
|
docker rm -f libero-eval || true
|
||||||
|
|
||||||
|
- name: Parse Libero eval metrics
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
python3 scripts/ci/parse_eval_metrics.py \
|
||||||
|
--artifacts-dir /tmp/libero-artifacts \
|
||||||
|
--env libero \
|
||||||
|
--task libero_spatial \
|
||||||
|
--policy pepijn223/smolvla_libero
|
||||||
|
|
||||||
|
- name: Upload Libero rollout video
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: libero-rollout-video
|
||||||
|
path: /tmp/libero-artifacts/videos/
|
||||||
|
if-no-files-found: warn
|
||||||
|
|
||||||
|
- name: Upload Libero eval metrics
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: libero-metrics
|
||||||
|
path: /tmp/libero-artifacts/metrics.json
|
||||||
|
if-no-files-found: warn
|
||||||
|
|
||||||
|
# ── LIBERO TRAIN+EVAL SMOKE ──────────────────────────────────────────────
|
||||||
|
# Train SmolVLA for 1 step (batch_size=1, dataset episode 0 only) then
|
||||||
|
# immediately runs eval inside the training loop (eval_freq=1, 1 episode).
|
||||||
|
# Tests the full train→eval-within-training pipeline end-to-end.
|
||||||
|
- name: Run Libero train+eval smoke (1 step, eval_freq=1)
|
||||||
|
run: |
|
||||||
|
docker run --name libero-train-smoke --gpus all \
|
||||||
|
--shm-size=4g \
|
||||||
|
-e HF_HOME=/tmp/hf \
|
||||||
|
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||||
|
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||||
|
lerobot-benchmark-libero:ci \
|
||||||
|
bash -c "
|
||||||
|
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||||
|
accelerate launch --num_processes=1 \$(which lerobot-train) \
|
||||||
|
--policy.path=lerobot/smolvla_base \
|
||||||
|
--policy.load_vlm_weights=true \
|
||||||
|
--policy.scheduler_decay_steps=25000 \
|
||||||
|
--policy.freeze_vision_encoder=false \
|
||||||
|
--policy.train_expert_only=false \
|
||||||
|
--dataset.repo_id=lerobot/libero \
|
||||||
|
--dataset.episodes=[0] \
|
||||||
|
--dataset.use_imagenet_stats=false \
|
||||||
|
--env.type=libero \
|
||||||
|
--env.task=libero_spatial \
|
||||||
|
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
|
||||||
|
--policy.empty_cameras=1 \
|
||||||
|
--output_dir=/tmp/train-smoke \
|
||||||
|
--steps=1 \
|
||||||
|
--batch_size=1 \
|
||||||
|
--eval_freq=1 \
|
||||||
|
--eval.n_episodes=1 \
|
||||||
|
--eval.batch_size=1 \
|
||||||
|
--eval.use_async_envs=false \
|
||||||
|
--save_freq=1 \
|
||||||
|
--policy.push_to_hub=false \
|
||||||
|
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.image2\": \"observation.images.camera2\"}'
|
||||||
|
"
|
||||||
|
|
||||||
|
- name: Copy Libero train-smoke artifacts from container
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/libero-train-smoke-artifacts
|
||||||
|
docker cp libero-train-smoke:/tmp/train-smoke/. /tmp/libero-train-smoke-artifacts/ 2>/dev/null || true
|
||||||
|
docker rm -f libero-train-smoke || true
|
||||||
|
|
||||||
|
- name: Upload Libero train-smoke eval video
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: libero-train-smoke-video
|
||||||
|
path: /tmp/libero-train-smoke-artifacts/eval/
|
||||||
|
if-no-files-found: warn
|
||||||
|
|
||||||
|
# ── METAWORLD ─────────────────────────────────────────────────────────────
|
||||||
|
# Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
|
||||||
|
metaworld-integration-test:
|
||||||
|
name: MetaWorld — build image + 1-episode eval
|
||||||
|
runs-on:
|
||||||
|
group: aws-g6-4xlarge-plus
|
||||||
|
env:
|
||||||
|
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
persist-credentials: false
|
||||||
|
lfs: true
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
cache-binary: false
|
||||||
|
|
||||||
|
- name: Build MetaWorld benchmark image
|
||||||
|
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: docker/Dockerfile.benchmark.metaworld
|
||||||
|
push: false
|
||||||
|
load: true
|
||||||
|
tags: lerobot-benchmark-metaworld:ci
|
||||||
|
cache-from: type=local,src=/tmp/.buildx-cache-metaworld
|
||||||
|
cache-to: type=local,dest=/tmp/.buildx-cache-metaworld,mode=max
|
||||||
|
|
||||||
|
- name: Run MetaWorld smoke eval (1 episode)
|
||||||
|
run: |
|
||||||
|
docker run --name metaworld-eval --gpus all \
|
||||||
|
--shm-size=4g \
|
||||||
|
-e HF_HOME=/tmp/hf \
|
||||||
|
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||||
|
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||||
|
lerobot-benchmark-metaworld:ci \
|
||||||
|
bash -c "
|
||||||
|
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=pepijn223/smolvla_metaworld \
|
||||||
|
--env.type=metaworld \
|
||||||
|
--env.task=metaworld-push-v3 \
|
||||||
|
--eval.batch_size=1 \
|
||||||
|
--eval.n_episodes=1 \
|
||||||
|
--eval.use_async_envs=false \
|
||||||
|
--policy.device=cuda \
|
||||||
|
'--rename_map={\"observation.image\": \"observation.images.camera1\"}' \
|
||||||
|
--policy.empty_cameras=2 \
|
||||||
|
--output_dir=/tmp/eval-artifacts
|
||||||
|
python3 /lerobot/scripts/ci/extract_task_descriptions.py \
|
||||||
|
--env metaworld --task metaworld-push-v3 \
|
||||||
|
--output /tmp/eval-artifacts/task_descriptions.json 2>/dev/null || true
|
||||||
|
"
|
||||||
|
|
||||||
|
- name: Copy MetaWorld artifacts from container
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/metaworld-artifacts
|
||||||
|
docker cp metaworld-eval:/tmp/eval-artifacts/. /tmp/metaworld-artifacts/ 2>/dev/null || true
|
||||||
|
docker rm -f metaworld-eval || true
|
||||||
|
|
||||||
|
- name: Parse MetaWorld eval metrics
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
python3 scripts/ci/parse_eval_metrics.py \
|
||||||
|
--artifacts-dir /tmp/metaworld-artifacts \
|
||||||
|
--env metaworld \
|
||||||
|
--task metaworld-push-v3 \
|
||||||
|
--policy pepijn223/smolvla_metaworld
|
||||||
|
|
||||||
|
- name: Upload MetaWorld rollout video
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: metaworld-rollout-video
|
||||||
|
path: /tmp/metaworld-artifacts/videos/
|
||||||
|
if-no-files-found: warn
|
||||||
|
|
||||||
|
- name: Upload MetaWorld eval metrics
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: metaworld-metrics
|
||||||
|
path: /tmp/metaworld-artifacts/metrics.json
|
||||||
|
if-no-files-found: warn
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Isolated benchmark image for LIBERO integration tests.
|
||||||
|
# Installs only lerobot[libero] so its dep tree (hf-libero, dm-control, mujoco)
|
||||||
|
# cannot conflict with other benchmarks.
|
||||||
|
#
|
||||||
|
# Build: docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero .
|
||||||
|
# Run: docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ...
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG OS_VERSION=22.04
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
MUJOCO_GL=egl \
|
||||||
|
PATH=/lerobot/.venv/bin:$PATH \
|
||||||
|
CUDA_VISIBLE_DEVICES=0 \
|
||||||
|
DEVICE=cuda
|
||||||
|
|
||||||
|
# System deps — same set as Dockerfile.internal
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common build-essential git curl \
|
||||||
|
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
|
||||||
|
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
|
||||||
|
cmake pkg-config ninja-build \
|
||||||
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends \
|
||||||
|
python${PYTHON_VERSION} \
|
||||||
|
python${PYTHON_VERSION}-venv \
|
||||||
|
python${PYTHON_VERSION}-dev \
|
||||||
|
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||||
|
&& mv /root/.local/bin/uv /usr/local/bin/uv \
|
||||||
|
&& useradd --create-home --shell /bin/bash user_lerobot \
|
||||||
|
&& usermod -aG sudo user_lerobot \
|
||||||
|
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /lerobot
|
||||||
|
RUN chown -R user_lerobot:user_lerobot /lerobot
|
||||||
|
USER user_lerobot
|
||||||
|
|
||||||
|
ENV HOME=/home/user_lerobot \
|
||||||
|
HF_HOME=/home/user_lerobot/.cache/huggingface \
|
||||||
|
HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
|
||||||
|
TORCH_HOME=/home/user_lerobot/.cache/torch \
|
||||||
|
TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
|
||||||
|
|
||||||
|
RUN uv venv --python python${PYTHON_VERSION}
|
||||||
|
|
||||||
|
# Install only lerobot[libero] — completely isolated from metaworld's dep tree
|
||||||
|
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||||
|
COPY --chown=user_lerobot:user_lerobot src/ src/
|
||||||
|
|
||||||
|
RUN uv sync --locked --extra libero --extra smolvla --no-cache
|
||||||
|
|
||||||
|
# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
|
||||||
|
# runtime (which times out on CI). Point the libero config at the cached path.
|
||||||
|
# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
|
||||||
|
# so we write the config before any libero import can happen.
|
||||||
|
RUN LIBERO_DIR=$(python${PYTHON_VERSION} -c \
|
||||||
|
"import importlib.util, os; s=importlib.util.find_spec('libero'); \
|
||||||
|
print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
|
||||||
|
mkdir -p /home/user_lerobot/.libero && \
|
||||||
|
python${PYTHON_VERSION} -c "\
|
||||||
|
from huggingface_hub import snapshot_download; \
|
||||||
|
snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
|
||||||
|
local_dir='/home/user_lerobot/.libero/assets')" && \
|
||||||
|
printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
|
||||||
|
> /home/user_lerobot/.libero/config.yaml
|
||||||
|
|
||||||
|
RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
|
||||||
|
|
||||||
|
COPY --chown=user_lerobot:user_lerobot . .
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Isolated benchmark image for MetaWorld integration tests.
|
||||||
|
# Installs only lerobot[metaworld] so its dep tree (metaworld==3.0.0, mujoco>=3)
|
||||||
|
# cannot conflict with other benchmarks.
|
||||||
|
#
|
||||||
|
# Build: docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld .
|
||||||
|
# Run: docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ...
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG OS_VERSION=22.04
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
MUJOCO_GL=egl \
|
||||||
|
PATH=/lerobot/.venv/bin:$PATH \
|
||||||
|
CUDA_VISIBLE_DEVICES=0 \
|
||||||
|
DEVICE=cuda
|
||||||
|
|
||||||
|
# System deps — same set as Dockerfile.internal
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common build-essential git curl \
|
||||||
|
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
|
||||||
|
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
|
||||||
|
cmake pkg-config ninja-build \
|
||||||
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends \
|
||||||
|
python${PYTHON_VERSION} \
|
||||||
|
python${PYTHON_VERSION}-venv \
|
||||||
|
python${PYTHON_VERSION}-dev \
|
||||||
|
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||||
|
&& mv /root/.local/bin/uv /usr/local/bin/uv \
|
||||||
|
&& useradd --create-home --shell /bin/bash user_lerobot \
|
||||||
|
&& usermod -aG sudo user_lerobot \
|
||||||
|
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /lerobot
|
||||||
|
RUN chown -R user_lerobot:user_lerobot /lerobot
|
||||||
|
USER user_lerobot
|
||||||
|
|
||||||
|
ENV HOME=/home/user_lerobot \
|
||||||
|
HF_HOME=/home/user_lerobot/.cache/huggingface \
|
||||||
|
HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
|
||||||
|
TORCH_HOME=/home/user_lerobot/.cache/torch \
|
||||||
|
TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
|
||||||
|
|
||||||
|
RUN uv venv --python python${PYTHON_VERSION}
|
||||||
|
|
||||||
|
# Install only lerobot[metaworld] — completely isolated from libero's dep tree
|
||||||
|
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||||
|
COPY --chown=user_lerobot:user_lerobot src/ src/
|
||||||
|
|
||||||
|
RUN uv sync --locked --extra metaworld --extra smolvla --no-cache
|
||||||
|
|
||||||
|
RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
|
||||||
|
|
||||||
|
COPY --chown=user_lerobot:user_lerobot . .
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
@@ -73,6 +73,8 @@
|
|||||||
title: Control & Train Robots in Sim (LeIsaac)
|
title: Control & Train Robots in Sim (LeIsaac)
|
||||||
title: "Simulation"
|
title: "Simulation"
|
||||||
- sections:
|
- sections:
|
||||||
|
- local: evaluation
|
||||||
|
title: Evaluation (lerobot-eval)
|
||||||
- local: adding_benchmarks
|
- local: adding_benchmarks
|
||||||
title: Adding a New Benchmark
|
title: Adding a New Benchmark
|
||||||
- local: libero
|
- local: libero
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ During evaluation, data moves through four stages:
|
|||||||
1. gym.Env ──→ raw observations (numpy dicts)
|
1. gym.Env ──→ raw observations (numpy dicts)
|
||||||
|
|
||||||
2. Preprocessing ──→ standard LeRobot keys + task description
|
2. Preprocessing ──→ standard LeRobot keys + task description
|
||||||
(preprocess_observation, add_envs_task in envs/utils.py)
|
(preprocess_observation in envs/utils.py, env.call("task_description"))
|
||||||
|
|
||||||
3. Processors ──→ env-specific then policy-specific transforms
|
3. Processors ──→ env-specific then policy-specific transforms
|
||||||
(env_preprocessor, policy_preprocessor)
|
(env_preprocessor, policy_preprocessor)
|
||||||
@@ -115,23 +115,24 @@ Each `EnvConfig` subclass declares two dicts that tell the policy what to expect
|
|||||||
## Step by step
|
## Step by step
|
||||||
|
|
||||||
<Tip>
|
<Tip>
|
||||||
At minimum, you need three files: a **gym.Env wrapper**, an **EnvConfig
|
At minimum, you need two files: a **gym.Env wrapper** and an **EnvConfig
|
||||||
subclass**, and a **factory dispatch branch**. Everything else is optional or
|
subclass** with a `create_envs()` override. Everything else is optional or
|
||||||
documentation.
|
documentation. No changes to `factory.py` are needed.
|
||||||
</Tip>
|
</Tip>
|
||||||
|
|
||||||
### Checklist
|
### Checklist
|
||||||
|
|
||||||
| File | Required | Why |
|
| File | Required | Why |
|
||||||
| ---------------------------------------- | -------- | ----------------------------------------- |
|
| ----------------------------------------- | -------- | ------------------------------------------------------------ |
|
||||||
| `src/lerobot/envs/<benchmark>.py` | Yes | Wraps the simulator as a standard gym.Env |
|
| `src/lerobot/envs/<benchmark>.py` | Yes | Wraps the simulator as a standard gym.Env |
|
||||||
| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark for the CLI |
|
| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark and its `create_envs()` for the CLI |
|
||||||
| `src/lerobot/envs/factory.py` | Yes | Tells `make_env()` how to build your envs |
|
| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms |
|
||||||
| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms |
|
| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys |
|
||||||
| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys |
|
| `pyproject.toml` | Yes | Declares benchmark-specific dependencies |
|
||||||
| `pyproject.toml` | Yes | Declares benchmark-specific dependencies |
|
| `docs/source/<benchmark>.mdx` | Yes | User-facing documentation page |
|
||||||
| `docs/source/<benchmark>.mdx` | Yes | User-facing documentation page |
|
| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar |
|
||||||
| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar |
|
| `docker/Dockerfile.benchmark.<benchmark>` | Yes | Isolated Docker image for CI smoke tests |
|
||||||
|
| `.github/workflows/benchmark_tests.yml` | Yes | CI job that builds the image and runs a 1-episode smoke eval |
|
||||||
|
|
||||||
### 1. The gym.Env wrapper (`src/lerobot/envs/<benchmark>.py`)
|
### 1. The gym.Env wrapper (`src/lerobot/envs/<benchmark>.py`)
|
||||||
|
|
||||||
@@ -162,6 +163,8 @@ class MyBenchmarkEnv(gym.Env):
|
|||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**GPU-based simulators (e.g. MuJoCo with EGL rendering):** If your simulator allocates GPU/EGL contexts during `__init__`, defer that allocation to a `_ensure_env()` helper called on first `reset()`/`step()`. This avoids inheriting stale GPU handles when `AsyncVectorEnv` spawns worker processes. See `LiberoEnv._ensure_env()` for the pattern.
|
||||||
|
|
||||||
Also provide a factory function that returns the nested dict structure:
|
Also provide a factory function that returns the nested dict structure:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -179,7 +182,10 @@ See `create_libero_envs()` (multi-suite, multi-task) and `create_metaworld_envs(
|
|||||||
|
|
||||||
### 2. The config (`src/lerobot/envs/configs.py`)
|
### 2. The config (`src/lerobot/envs/configs.py`)
|
||||||
|
|
||||||
Register a config dataclass so users can select your benchmark with `--env.type=<name>`:
|
Register a config dataclass so users can select your benchmark with `--env.type=<name>`. Each config owns its environment creation and processor logic via two methods:
|
||||||
|
|
||||||
|
- **`create_envs(n_envs, use_async_envs)`** — Returns `{suite: {task_id: VectorEnv}}`. The base class default uses `gym.make()` for single-task envs. Multi-task benchmarks override this.
|
||||||
|
- **`get_env_processors()`** — Returns `(preprocessor, postprocessor)`. The base class default returns identity (no-op) pipelines. Override if your benchmark needs observation/action transforms.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@EnvConfig.register_subclass("<benchmark_name>")
|
@EnvConfig.register_subclass("<benchmark_name>")
|
||||||
@@ -204,6 +210,20 @@ class MyBenchmarkEnvConfig(EnvConfig):
|
|||||||
@property
|
@property
|
||||||
def gym_kwargs(self) -> dict:
|
def gym_kwargs(self) -> dict:
|
||||||
return {"obs_type": self.obs_type, "render_mode": self.render_mode}
|
return {"obs_type": self.obs_type, "render_mode": self.render_mode}
|
||||||
|
|
||||||
|
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||||
|
"""Override for multi-task benchmarks or custom env creation."""
|
||||||
|
from lerobot.envs.<benchmark> import create_<benchmark>_envs
|
||||||
|
return create_<benchmark>_envs(task=self.task, n_envs=n_envs, ...)
|
||||||
|
|
||||||
|
def get_env_processors(self):
|
||||||
|
"""Override if your benchmark needs observation/action transforms."""
|
||||||
|
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||||
|
from lerobot.processor.env_processor import MyBenchmarkProcessorStep
|
||||||
|
return (
|
||||||
|
PolicyProcessorPipeline(steps=[MyBenchmarkProcessorStep()]),
|
||||||
|
PolicyProcessorPipeline(steps=[]),
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Key points:
|
Key points:
|
||||||
@@ -211,36 +231,11 @@ Key points:
|
|||||||
- The `register_subclass` name is what users pass on the CLI (`--env.type=<name>`).
|
- The `register_subclass` name is what users pass on the CLI (`--env.type=<name>`).
|
||||||
- `features` tells the policy what the environment produces.
|
- `features` tells the policy what the environment produces.
|
||||||
- `features_map` maps raw observation keys to LeRobot convention keys.
|
- `features_map` maps raw observation keys to LeRobot convention keys.
|
||||||
|
- **No changes to `factory.py` needed** — the factory delegates to `cfg.create_envs()` and `cfg.get_env_processors()` automatically.
|
||||||
|
|
||||||
### 3. The factory dispatch (`src/lerobot/envs/factory.py`)
|
### 3. Env processor (optional — `src/lerobot/processor/env_processor.py`)
|
||||||
|
|
||||||
Add a branch in `make_env()` to call your factory function:
|
Only needed if your benchmark requires observation transforms beyond what `preprocess_observation()` handles (e.g. image flipping, coordinate conversion). Define the processor step here and return it from `get_env_processors()` in your config (see step 2):
|
||||||
|
|
||||||
```python
|
|
||||||
elif "<benchmark_name>" in cfg.type:
|
|
||||||
from lerobot.envs.<benchmark> import create_<benchmark>_envs
|
|
||||||
|
|
||||||
if cfg.task is None:
|
|
||||||
raise ValueError("<BenchmarkName> requires a task to be specified")
|
|
||||||
|
|
||||||
return create_<benchmark>_envs(
|
|
||||||
task=cfg.task,
|
|
||||||
n_envs=n_envs,
|
|
||||||
gym_kwargs=cfg.gym_kwargs,
|
|
||||||
env_cls=env_cls,
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
If your benchmark needs an env processor, add it in `make_env_pre_post_processors()`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
if isinstance(env_cfg, MyBenchmarkEnvConfig) or "<benchmark_name>" in env_cfg.type:
|
|
||||||
preprocessor_steps.append(MyBenchmarkProcessorStep())
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Env processor (optional — `src/lerobot/processor/env_processor.py`)
|
|
||||||
|
|
||||||
Only needed if your benchmark requires observation transforms beyond what `preprocess_observation()` handles (e.g. image flipping, coordinate conversion):
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -260,7 +255,7 @@ class MyBenchmarkProcessorStep(ObservationProcessorStep):
|
|||||||
|
|
||||||
See `LiberoProcessorStep` for a full example (image rotation, quaternion-to-axis-angle conversion).
|
See `LiberoProcessorStep` for a full example (image rotation, quaternion-to-axis-angle conversion).
|
||||||
|
|
||||||
### 5. Dependencies (`pyproject.toml`)
|
### 4. Dependencies (`pyproject.toml`)
|
||||||
|
|
||||||
Add a new optional-dependency group:
|
Add a new optional-dependency group:
|
||||||
|
|
||||||
@@ -281,11 +276,11 @@ Users install with:
|
|||||||
pip install -e ".[mybenchmark]"
|
pip install -e ".[mybenchmark]"
|
||||||
```
|
```
|
||||||
|
|
||||||
### 6. Documentation (`docs/source/<benchmark>.mdx`)
|
### 5. Documentation (`docs/source/<benchmark>.mdx`)
|
||||||
|
|
||||||
Write a user-facing page following the template in the next section. See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for full examples.
|
Write a user-facing page following the template in the next section. See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for full examples.
|
||||||
|
|
||||||
### 7. Table of contents (`docs/source/_toctree.yml`)
|
### 6. Table of contents (`docs/source/_toctree.yml`)
|
||||||
|
|
||||||
Add your benchmark to the "Benchmarks" section:
|
Add your benchmark to the "Benchmarks" section:
|
||||||
|
|
||||||
@@ -302,14 +297,87 @@ Add your benchmark to the "Benchmarks" section:
|
|||||||
title: "Benchmarks"
|
title: "Benchmarks"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 7. CI smoke test (`docker/` + `.github/workflows/benchmark_tests.yml`)
|
||||||
|
|
||||||
|
Each benchmark must have an isolated Docker image and a CI job that runs a 1-episode eval. This catches install-time regressions (broken transitive deps, import errors, interactive prompts) before they reach users.
|
||||||
|
|
||||||
|
**Create `docker/Dockerfile.benchmark.<benchmark>`** — copy an existing one and change only the extra name:
|
||||||
|
|
||||||
|
```dockerfile
|
||||||
|
# Isolated benchmark image — installs lerobot[<benchmark>] only.
|
||||||
|
# Build: docker build -f docker/Dockerfile.benchmark.<benchmark> -t lerobot-benchmark-<benchmark> .
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG OS_VERSION=22.04
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
|
||||||
|
ARG PYTHON_VERSION=3.12
|
||||||
|
# ... (same system deps as Dockerfile.benchmark.libero) ...
|
||||||
|
RUN uv sync --locked --extra <benchmark> --no-cache
|
||||||
|
```
|
||||||
|
|
||||||
|
Each benchmark gets its own image so its dependency tree (pinned simulator packages, specific mujoco/scipy versions) cannot conflict with other benchmarks.
|
||||||
|
|
||||||
|
**Add a job to `.github/workflows/benchmark_tests.yml`** — copy an existing job block and adjust:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
<benchmark>-integration-test:
|
||||||
|
name: <Benchmark> — build image + 1-episode eval
|
||||||
|
runs-on:
|
||||||
|
group: aws-g6-4xlarge-plus
|
||||||
|
env:
|
||||||
|
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
persist-credentials: false
|
||||||
|
lfs: true
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
cache-binary: false
|
||||||
|
- name: Build <Benchmark> image
|
||||||
|
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: docker/Dockerfile.benchmark.<benchmark>
|
||||||
|
push: false
|
||||||
|
load: true
|
||||||
|
tags: lerobot-benchmark-<benchmark>:ci
|
||||||
|
cache-from: type=local,src=/tmp/.buildx-cache-<benchmark>
|
||||||
|
cache-to: type=local,dest=/tmp/.buildx-cache-<benchmark>,mode=max
|
||||||
|
- name: Run <Benchmark> smoke eval (1 episode)
|
||||||
|
run: |
|
||||||
|
docker run --rm --gpus all \
|
||||||
|
--shm-size=4g \
|
||||||
|
-e HF_HOME=/tmp/hf \
|
||||||
|
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||||
|
lerobot-benchmark-<benchmark>:ci \
|
||||||
|
bash -c "
|
||||||
|
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=<hub_policy_path> \
|
||||||
|
--env.type=<benchmark> \
|
||||||
|
--env.task=<task> \
|
||||||
|
--eval.batch_size=1 \
|
||||||
|
--eval.n_episodes=1 \
|
||||||
|
--eval.use_async_envs=false \
|
||||||
|
--policy.device=cuda
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tips:**
|
||||||
|
|
||||||
|
- If the benchmark library prompts for user input on import (like LIBERO asking for a dataset folder), pass the relevant env var in the `docker run` command (e.g. `-e LIBERO_DATA_FOLDER=/tmp/libero_data`).
|
||||||
|
- The job is scoped to only trigger on changes to `src/lerobot/envs/**`, `src/lerobot/scripts/lerobot_eval.py`, and the Dockerfiles — it won't run on unrelated PRs.
|
||||||
|
|
||||||
## Verifying your integration
|
## Verifying your integration
|
||||||
|
|
||||||
After completing the steps above, confirm that everything works:
|
After completing the steps above, confirm that everything works:
|
||||||
|
|
||||||
1. **Install** — `pip install -e ".[mybenchmark]"` and verify the dependency group installs cleanly.
|
1. **Install** — `pip install -e ".[mybenchmark]"` and verify the dependency group installs cleanly.
|
||||||
2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys.
|
2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys.
|
||||||
3. **Run a full eval** — `lerobot-eval --env.type=<name> --env.task=<task> --eval.n_episodes=1 --eval.batch_size=1 --policy.path=<any_compatible_policy>` to exercise the full pipeline end-to-end.
|
3. **Run a full eval** — `lerobot-eval --env.type=<name> --env.task=<task> --eval.n_episodes=1 --policy.path=<any_compatible_policy>` to exercise the full pipeline end-to-end. (`batch_size` defaults to auto-tuning based on CPU cores; pass `--eval.batch_size=1` to force a single environment.)
|
||||||
4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates.
|
4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates.
|
||||||
|
5. **Add CI smoke test** — follow step 7 above to add a Dockerfile and CI job. This ensures the install stays green as dependencies evolve.
|
||||||
|
|
||||||
## Writing a benchmark doc page
|
## Writing a benchmark doc page
|
||||||
|
|
||||||
@@ -320,7 +388,7 @@ Each benchmark `.mdx` page should include:
|
|||||||
- **Overview image or GIF.**
|
- **Overview image or GIF.**
|
||||||
- **Available tasks** — table of task suites with counts and brief descriptions.
|
- **Available tasks** — table of task suites with counts and brief descriptions.
|
||||||
- **Installation** — `pip install -e ".[<benchmark>]"` plus any extra steps (env vars, system packages).
|
- **Installation** — `pip install -e ".[<benchmark>]"` plus any extra steps (env vars, system packages).
|
||||||
- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` and `batch_size` for reproducible results. Include single-task and multi-task examples if applicable.
|
- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` for reproducible results. `batch_size` defaults to auto; only specify it if needed. Include single-task and multi-task examples if applicable. See the [Evaluation guide](evaluation) for details.
|
||||||
- **Policy inputs and outputs** — observation keys with shapes, action space description.
|
- **Policy inputs and outputs** — observation keys with shapes, action space description.
|
||||||
- **Recommended evaluation episodes** — how many episodes per task is standard.
|
- **Recommended evaluation episodes** — how many episodes per task is standard.
|
||||||
- **Training** — example `lerobot-train` command.
|
- **Training** — example `lerobot-train` command.
|
||||||
|
|||||||
@@ -0,0 +1,162 @@
|
|||||||
|
# Evaluation
|
||||||
|
|
||||||
|
`lerobot-eval` runs a trained policy on a simulation benchmark and reports success rate, reward, and (optionally) episode videos. It handles environment creation, batched rollouts, and metric aggregation automatically.
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
Evaluate a Hub-hosted policy on LIBERO:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=pepijn223/smolvla_libero \
|
||||||
|
--env.type=libero \
|
||||||
|
--env.task=libero_spatial \
|
||||||
|
--eval.n_episodes=10 \
|
||||||
|
--policy.device=cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
Evaluate a local checkpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=outputs/train/act_pusht/checkpoints/005000/pretrained_model \
|
||||||
|
--env.type=pusht \
|
||||||
|
--eval.n_episodes=10
|
||||||
|
```
|
||||||
|
|
||||||
|
`batch_size` defaults to **auto** (based on CPU cores). The script picks the right number of parallel environments for your machine.
|
||||||
|
|
||||||
|
## Key flags
|
||||||
|
|
||||||
|
| Flag | Default | Description |
|
||||||
|
| ----------------------- | -------------- | ------------------------------------------------------------------------------------- |
|
||||||
|
| `--policy.path` | required | Hub repo ID or local path to a pretrained model |
|
||||||
|
| `--env.type` | required | Benchmark name (`pusht`, `libero`, `metaworld`, etc.) |
|
||||||
|
| `--env.task` | varies | Task or suite name (e.g. `libero_spatial`, `libero_10`) |
|
||||||
|
| `--eval.n_episodes` | `50` | Total episodes to run (across all tasks) |
|
||||||
|
| `--eval.batch_size` | `0` (auto) | Number of parallel environments. `0` = auto-tune from CPU cores |
|
||||||
|
| `--eval.use_async_envs` | `true` | Use `AsyncVectorEnv` (parallel stepping). Auto-downgrades to sync when `batch_size=1` |
|
||||||
|
| `--policy.device` | `cuda` | Inference device |
|
||||||
|
| `--policy.use_amp` | `false` | Mixed-precision inference (saves VRAM, faster on Ampere+) |
|
||||||
|
| `--seed` | `1000` | Random seed for reproducibility |
|
||||||
|
| `--output_dir` | auto-generated | Where to write results and videos |
|
||||||
|
|
||||||
|
### Environment-specific flags
|
||||||
|
|
||||||
|
Some benchmarks accept additional flags through `--env.*`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# LIBERO: map simulator camera names to policy feature names
|
||||||
|
--env.camera_name_mapping='{"agentview_image": "camera1", "robot0_eye_in_hand_image": "camera2"}'
|
||||||
|
|
||||||
|
# Fill unused camera slots with zeros
|
||||||
|
--policy.empty_cameras=1
|
||||||
|
```
|
||||||
|
|
||||||
|
See each benchmark's documentation ([LIBERO](libero), [Meta-World](metaworld)) for benchmark-specific flags.
|
||||||
|
|
||||||
|
## How batch_size works
|
||||||
|
|
||||||
|
`batch_size` controls how many environments run in parallel within a single `VectorEnv`:
|
||||||
|
|
||||||
|
| `batch_size` | Behavior |
|
||||||
|
| ------------- | -------------------------------------------------------------------- |
|
||||||
|
| `0` (default) | Auto-tune: `floor(cpu_cores × 0.7)`, capped by `n_episodes` and `64` |
|
||||||
|
| `1` | Single environment, synchronous. Useful for debugging |
|
||||||
|
| `N` | N environments step in parallel via `AsyncVectorEnv` |
|
||||||
|
|
||||||
|
When `batch_size > 1` and `use_async_envs=true`, each environment runs in its own subprocess via Gymnasium's `AsyncVectorEnv`. This parallelizes the simulation stepping (the main bottleneck), while the policy runs a single batched forward pass on GPU.
|
||||||
|
|
||||||
|
**Example:** On a 16-core machine with `n_episodes=100`:
|
||||||
|
|
||||||
|
- Auto batch_size = `floor(16 × 0.7)` = `11`
|
||||||
|
- 11 environments step simultaneously → ~11× faster than sequential
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
### AsyncVectorEnv (default)
|
||||||
|
|
||||||
|
`AsyncVectorEnv` spawns one subprocess per environment. Each subprocess has its own simulator instance. While the policy computes actions on GPU, all environments step in parallel on CPU:
|
||||||
|
|
||||||
|
```
|
||||||
|
GPU: [inference]....[inference]....[inference]....
|
||||||
|
CPU: [step × N]....................[step × N]......
|
||||||
|
↑ parallel ↑ parallel
|
||||||
|
```
|
||||||
|
|
||||||
|
For GPU-based simulators (LIBERO, Meta-World), the environments use **lazy initialization**: the GPU/EGL context is created inside the worker subprocess on first `reset()`, not in the parent process. This avoids `EGL_BAD_CONTEXT` crashes from inheriting stale GPU handles across `fork()`.
|
||||||
|
|
||||||
|
### Lazy task loading
|
||||||
|
|
||||||
|
For multi-task benchmarks (e.g. LIBERO with 10 tasks), environments are wrapped in `_LazyAsyncVectorEnv` which defers worker creation until the task is actually evaluated. This keeps peak process count = `batch_size` instead of `n_tasks × batch_size`. After each task completes, workers are closed to free resources.
|
||||||
|
|
||||||
|
### Tuning for speed
|
||||||
|
|
||||||
|
| Situation | Recommendation |
|
||||||
|
| ------------------------------ | ----------------------------------------------------- |
|
||||||
|
| Slow eval, low GPU utilization | Increase `batch_size` (or leave at auto) |
|
||||||
|
| Out of memory (system RAM) | Decrease `batch_size` |
|
||||||
|
| Out of GPU memory | Decrease `batch_size`, or use `--policy.use_amp=true` |
|
||||||
|
| Debugging / single-stepping | `--eval.batch_size=1 --eval.use_async_envs=false` |
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
Results are written to `output_dir` (default: `outputs/eval/<date>/<time>_<job_name>/`):
|
||||||
|
|
||||||
|
- `eval_info.json` — full metrics: per-episode, per-task, per-group, and overall aggregates
|
||||||
|
- `videos/` — episode recordings (when `--eval.n_episodes_to_render > 0`)
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
|
||||||
|
| Metric | Description |
|
||||||
|
| ---------------- | -------------------------------------------------------------------- |
|
||||||
|
| `pc_success` | Success rate (%). Based on `info["is_success"]` from the environment |
|
||||||
|
| `avg_sum_reward` | Mean cumulative reward per episode |
|
||||||
|
| `avg_max_reward` | Mean peak reward per episode |
|
||||||
|
| `n_episodes` | Total episodes evaluated |
|
||||||
|
| `eval_s` | Total wall-clock time |
|
||||||
|
| `eval_ep_s` | Mean wall-clock time per episode |
|
||||||
|
|
||||||
|
## Multi-task evaluation
|
||||||
|
|
||||||
|
For benchmarks with multiple tasks (LIBERO suites, Meta-World MT50), `lerobot-eval` automatically:
|
||||||
|
|
||||||
|
1. Creates environments for all tasks in the selected suite(s)
|
||||||
|
2. Evaluates each task sequentially (one task's workers at a time)
|
||||||
|
3. Aggregates metrics per-task, per-group (suite), and overall
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Evaluate all 10 tasks in libero_spatial
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=pepijn223/smolvla_libero \
|
||||||
|
--env.type=libero \
|
||||||
|
--env.task=libero_spatial \
|
||||||
|
--eval.n_episodes=10
|
||||||
|
|
||||||
|
# Evaluate multiple suites
|
||||||
|
lerobot-eval \
|
||||||
|
--policy.path=pepijn223/smolvla_libero \
|
||||||
|
--env.type=libero \
|
||||||
|
--env.task="libero_spatial,libero_object" \
|
||||||
|
--eval.n_episodes=10
|
||||||
|
```
|
||||||
|
|
||||||
|
## API usage
|
||||||
|
|
||||||
|
You can call the eval functions directly from Python:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from lerobot.envs.factory import make_env
|
||||||
|
from lerobot.policies.factory import make_policy
|
||||||
|
from lerobot.scripts.lerobot_eval import eval_policy
|
||||||
|
|
||||||
|
envs = make_env(env_cfg, n_envs=10)
|
||||||
|
policy = make_policy(cfg=policy_cfg, env_cfg=env_cfg)
|
||||||
|
|
||||||
|
metrics = eval_policy(
|
||||||
|
env=envs["libero_spatial"][0],
|
||||||
|
policy=policy,
|
||||||
|
n_episodes=10,
|
||||||
|
)
|
||||||
|
print(metrics["pc_success"])
|
||||||
|
```
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Extract natural-language task descriptions for a benchmark suite.
|
||||||
|
|
||||||
|
Runs inside the benchmark Docker container (where the env library is installed)
|
||||||
|
immediately after lerobot-eval, writing a JSON file that parse_eval_metrics.py
|
||||||
|
picks up and embeds in metrics.json.
|
||||||
|
|
||||||
|
Output format: {"<suite>_<task_idx>": "<nl instruction>", ...}
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/ci/extract_task_descriptions.py \\
|
||||||
|
--env libero --task libero_spatial \\
|
||||||
|
--output /tmp/eval-artifacts/task_descriptions.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _libero_descriptions(task_suite: str) -> dict[str, str]:
    """Return ``{"<suite>_<idx>": "<natural-language instruction>"}`` for a LIBERO suite.

    Prints a warning to stderr and returns an empty dict when the suite name is
    unknown, so the calling script can still emit a (description-less) artifact.
    """
    # Imported lazily: libero is only installed inside the benchmark container.
    from libero.libero import benchmark  # type: ignore[import-untyped]

    suite_dict = benchmark.get_benchmark_dict()
    if task_suite not in suite_dict:
        print(
            f"[extract_task_descriptions] Unknown LIBERO suite '{task_suite}'. "
            f"Available: {list(suite_dict.keys())}",
            file=sys.stderr,
        )
        return {}
    suite = suite_dict[task_suite]()
    return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
|
||||||
|
|
||||||
|
|
||||||
|
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||||
|
# MetaWorld tasks don't expose a separate NL description attribute;
|
||||||
|
# use a cleaned version of the task name as the description.
|
||||||
|
label = task_name.removeprefix("metaworld-").replace("-", " ").strip()
|
||||||
|
return {f"{task_name}_0": label}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """CLI entry point: extract task descriptions and write them as JSON.

    Returns 0 always — description extraction is best-effort; an empty JSON
    object is still written so downstream parsing never blocks on this step.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
    parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
    parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
    args = parser.parse_args()

    descriptions: dict[str, str] = {}
    # Broad except is deliberate: a failed extraction must not fail the CI job;
    # the warning goes to stderr and an empty mapping is written instead.
    try:
        if args.env == "libero":
            descriptions = _libero_descriptions(args.task)
        elif args.env == "metaworld":
            descriptions = _metaworld_descriptions(args.task)
        else:
            print(
                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
                file=sys.stderr,
            )
    except Exception as exc:
        print(f"[extract_task_descriptions] Warning: {exc}", file=sys.stderr)

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(descriptions, indent=2))
    print(f"[extract_task_descriptions] {len(descriptions)} descriptions → {out_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Parse lerobot-eval output into a small metrics.json artifact.
|
||||||
|
|
||||||
|
Reads eval_info.json written by lerobot-eval --output_dir and extracts the
|
||||||
|
key metrics needed by the health dashboard. Handles both single-task and
|
||||||
|
multi-task eval output formats.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/ci/parse_eval_metrics.py \\
|
||||||
|
--artifacts-dir /tmp/libero-artifacts \\
|
||||||
|
--env libero \\
|
||||||
|
--task libero_spatial \\
|
||||||
|
--policy pepijn223/smolvla_libero
|
||||||
|
|
||||||
|
Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
|
||||||
|
as a GitHub Actions artifact named "<env>-metrics".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
|
||||||
|
"""Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.
|
||||||
|
|
||||||
|
Handles two output shapes:
|
||||||
|
- Single-task: {"aggregated": {"pc_success": 80.0, ...}}
|
||||||
|
- Multi-task: {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
|
||||||
|
"""
|
||||||
|
for key in ("aggregated", "overall"):
|
||||||
|
if key not in info:
|
||||||
|
continue
|
||||||
|
agg = info[key]
|
||||||
|
pc = agg.get("pc_success")
|
||||||
|
n = agg.get("n_episodes")
|
||||||
|
reward = agg.get("avg_sum_reward")
|
||||||
|
eval_s = agg.get("eval_s")
|
||||||
|
if pc is not None and not math.isnan(pc):
|
||||||
|
return (
|
||||||
|
float(pc),
|
||||||
|
int(n) if n is not None else None,
|
||||||
|
float(reward) if reward is not None else None,
|
||||||
|
float(eval_s) if eval_s is not None else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
return None, None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """CLI entry point: condense eval_info.json into metrics.json.

    Missing or malformed inputs downgrade to warnings — metrics.json is always
    written (with None fields) so the CI artifact upload never fails.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    eval_info_path = artifacts_dir / "eval_info.json"

    pc_success: float | None = None
    n_episodes: int | None = None
    avg_sum_reward: float | None = None
    eval_s: float | None = None

    if eval_info_path.exists():
        try:
            info = json.loads(eval_info_path.read_text())
            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
    else:
        print(
            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
            file=sys.stderr,
        )

    # Optional sidecar written by extract_task_descriptions.py; embed if present.
    task_descriptions: dict[str, str] = {}
    task_desc_path = artifacts_dir / "task_descriptions.json"
    if task_desc_path.exists():
        try:
            task_descriptions = json.loads(task_desc_path.read_text())
        except json.JSONDecodeError as exc:
            print(
                f"[parse_eval_metrics] Warning: could not parse task_descriptions.json: {exc}",
                file=sys.stderr,
            )

    metrics = {
        "env": args.env,
        "task": args.task,
        "policy": args.policy,
        "pc_success": pc_success,
        "n_episodes": n_episodes,
        "avg_sum_reward": avg_sum_reward,
        "eval_s": eval_s,
        "task_descriptions": task_descriptions,
    }

    out_path = artifacts_dir / "metrics.json"
    out_path.write_text(json.dumps(metrics, indent=2))
    print(f"[parse_eval_metrics] Written: {out_path}")
    # Echo to the job log for quick inspection without downloading the artifact.
    print(json.dumps(metrics, indent=2))

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -65,20 +65,27 @@ class WandBConfig:
|
|||||||
class EvalConfig:
    """Evaluation settings for lerobot-eval.

    NOTE(review): reconstructed from a diff-mangled span; the enclosing
    @dataclass decorator sits immediately above this block in the original file.
    """

    # Total number of evaluation episodes (across all tasks).
    n_episodes: int = 50
    # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
    # Set to 0 for auto-tuning based on available CPU cores and n_episodes.
    batch_size: int = 0
    # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
    # Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1.
    use_async_envs: bool = True

    def __post_init__(self) -> None:
        if self.batch_size == 0:
            self.batch_size = self._auto_batch_size()
        # Never spin up more environments than there are episodes to run.
        if self.batch_size > self.n_episodes:
            self.batch_size = self.n_episodes

    def _auto_batch_size(self) -> int:
        """Pick batch_size based on CPU cores, capped by n_episodes."""
        import math
        import os

        cpu_cores = os.cpu_count() or 4
        # Each async env worker needs ~1 core; leave headroom for main process + inference.
        by_cpu = max(1, math.floor(cpu_cores * 0.7))
        return min(by_cpu, self.n_episodes, 64)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
+123
-1
@@ -12,11 +12,16 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import abc
|
import abc
|
||||||
|
import importlib
|
||||||
from dataclasses import dataclass, field, fields
|
from dataclasses import dataclass, field, fields
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import draccus
|
import draccus
|
||||||
|
import gymnasium as gym
|
||||||
|
from gymnasium.envs.registration import registry as gym_registry
|
||||||
|
|
||||||
from lerobot.configs.types import FeatureType, PolicyFeature
|
from lerobot.configs.types import FeatureType, PolicyFeature
|
||||||
from lerobot.robots import RobotConfig
|
from lerobot.robots import RobotConfig
|
||||||
@@ -39,6 +44,13 @@ from lerobot.utils.constants import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_vec_env_cls(use_async: bool, n_envs: int):
    """Return the right VectorEnv constructor.

    AsyncVectorEnv only pays off with more than one environment; a single
    env always gets the in-process SyncVectorEnv.
    """
    if use_async and n_envs > 1:
        return gym.vector.AsyncVectorEnv
    return gym.vector.SyncVectorEnv
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class EnvConfig(draccus.ChoiceRegistry, abc.ABC):
|
class EnvConfig(draccus.ChoiceRegistry, abc.ABC):
|
||||||
task: str | None = None
|
task: str | None = None
|
||||||
@@ -67,6 +79,50 @@ class EnvConfig(draccus.ChoiceRegistry, abc.ABC):
|
|||||||
def gym_kwargs(self) -> dict:
|
def gym_kwargs(self) -> dict:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def create_envs(
|
||||||
|
self,
|
||||||
|
n_envs: int,
|
||||||
|
use_async_envs: bool = True,
|
||||||
|
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
||||||
|
"""Create {suite: {task_id: VectorEnv}}.
|
||||||
|
|
||||||
|
Default: single-task env via gym.make(). Multi-task benchmarks override.
|
||||||
|
AsyncVectorEnv is the default for n_envs > 1; auto-downgraded to Sync for n_envs=1.
|
||||||
|
"""
|
||||||
|
env_cls = gym.vector.AsyncVectorEnv if (use_async_envs and n_envs > 1) else gym.vector.SyncVectorEnv
|
||||||
|
|
||||||
|
if self.gym_id not in gym_registry:
|
||||||
|
print(f"gym id '{self.gym_id}' not found, attempting to import '{self.package_name}'...")
|
||||||
|
try:
|
||||||
|
importlib.import_module(self.package_name)
|
||||||
|
except ModuleNotFoundError as e:
|
||||||
|
raise ModuleNotFoundError(
|
||||||
|
f"Package '{self.package_name}' required for env '{self.type}' not found. "
|
||||||
|
f"Please install it or check PYTHONPATH."
|
||||||
|
) from e
|
||||||
|
|
||||||
|
if self.gym_id not in gym_registry:
|
||||||
|
raise gym.error.NameNotFound(
|
||||||
|
f"Environment '{self.gym_id}' not registered even after importing '{self.package_name}'."
|
||||||
|
)
|
||||||
|
|
||||||
|
def _make_one():
|
||||||
|
return gym.make(self.gym_id, disable_env_checker=self.disable_env_checker, **self.gym_kwargs)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from gymnasium.vector import AutoresetMode
|
||||||
|
|
||||||
|
vec = env_cls([_make_one for _ in range(n_envs)], autoreset_mode=AutoresetMode.SAME_STEP)
|
||||||
|
except ImportError:
|
||||||
|
vec = env_cls([_make_one for _ in range(n_envs)])
|
||||||
|
return {self.type: {0: vec}}
|
||||||
|
|
||||||
|
def get_env_processors(self):
|
||||||
|
"""Return (preprocessor, postprocessor) for this env. Default: identity."""
|
||||||
|
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||||
|
|
||||||
|
return PolicyProcessorPipeline(steps=[]), PolicyProcessorPipeline(steps=[])
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class HubEnvConfig(EnvConfig):
|
class HubEnvConfig(EnvConfig):
|
||||||
@@ -338,13 +394,51 @@ class LiberoEnv(EnvConfig):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||||
|
|
||||||
|
if self.camera_name_mapping is not None:
|
||||||
|
mapped_agentview = self.camera_name_mapping.get("agentview_image", "image")
|
||||||
|
mapped_eye_in_hand = self.camera_name_mapping.get("robot0_eye_in_hand_image", "image2")
|
||||||
|
self.features_map[LIBERO_KEY_PIXELS_AGENTVIEW] = f"{OBS_IMAGES}.{mapped_agentview}"
|
||||||
|
self.features_map[LIBERO_KEY_PIXELS_EYE_IN_HAND] = f"{OBS_IMAGES}.{mapped_eye_in_hand}"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def gym_kwargs(self) -> dict:
|
def gym_kwargs(self) -> dict:
|
||||||
kwargs: dict[str, Any] = {"obs_type": self.obs_type, "render_mode": self.render_mode}
|
kwargs: dict[str, Any] = {
|
||||||
|
"obs_type": self.obs_type,
|
||||||
|
"render_mode": self.render_mode,
|
||||||
|
"observation_height": self.observation_height,
|
||||||
|
"observation_width": self.observation_width,
|
||||||
|
}
|
||||||
if self.task_ids is not None:
|
if self.task_ids is not None:
|
||||||
kwargs["task_ids"] = self.task_ids
|
kwargs["task_ids"] = self.task_ids
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
|
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||||
|
from lerobot.envs.libero import create_libero_envs
|
||||||
|
|
||||||
|
if self.task is None:
|
||||||
|
raise ValueError("LiberoEnv requires a task to be specified")
|
||||||
|
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||||
|
return create_libero_envs(
|
||||||
|
task=self.task,
|
||||||
|
n_envs=n_envs,
|
||||||
|
camera_name=self.camera_name,
|
||||||
|
init_states=self.init_states,
|
||||||
|
gym_kwargs=self.gym_kwargs,
|
||||||
|
env_cls=env_cls,
|
||||||
|
control_mode=self.control_mode,
|
||||||
|
episode_length=self.episode_length,
|
||||||
|
camera_name_mapping=self.camera_name_mapping,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_env_processors(self):
|
||||||
|
from lerobot.processor.env_processor import LiberoProcessorStep
|
||||||
|
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||||
|
|
||||||
|
return (
|
||||||
|
PolicyProcessorPipeline(steps=[LiberoProcessorStep()]),
|
||||||
|
PolicyProcessorPipeline(steps=[]),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@EnvConfig.register_subclass("metaworld")
|
@EnvConfig.register_subclass("metaworld")
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -387,6 +481,19 @@ class MetaworldEnv(EnvConfig):
|
|||||||
"render_mode": self.render_mode,
|
"render_mode": self.render_mode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||||
|
from lerobot.envs.metaworld import create_metaworld_envs
|
||||||
|
|
||||||
|
if self.task is None:
|
||||||
|
raise ValueError("MetaWorld requires a task to be specified")
|
||||||
|
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||||
|
return create_metaworld_envs(
|
||||||
|
task=self.task,
|
||||||
|
n_envs=n_envs,
|
||||||
|
gym_kwargs=self.gym_kwargs,
|
||||||
|
env_cls=env_cls,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@EnvConfig.register_subclass("isaaclab_arena")
|
@EnvConfig.register_subclass("isaaclab_arena")
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -454,3 +561,18 @@ class IsaaclabArenaEnv(HubEnvConfig):
|
|||||||
@property
|
@property
|
||||||
def gym_kwargs(self) -> dict:
|
def gym_kwargs(self) -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def get_env_processors(self):
|
||||||
|
from lerobot.processor.env_processor import IsaaclabArenaProcessorStep
|
||||||
|
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||||
|
|
||||||
|
state_keys = tuple(k.strip() for k in (self.state_keys or "").split(",") if k.strip())
|
||||||
|
camera_keys = tuple(k.strip() for k in (self.camera_keys or "").split(",") if k.strip())
|
||||||
|
if not state_keys and not camera_keys:
|
||||||
|
raise ValueError("At least one of state_keys or camera_keys must be specified.")
|
||||||
|
return (
|
||||||
|
PolicyProcessorPipeline(
|
||||||
|
steps=[IsaaclabArenaProcessorStep(state_keys=state_keys, camera_keys=camera_keys)]
|
||||||
|
),
|
||||||
|
PolicyProcessorPipeline(steps=[]),
|
||||||
|
)
|
||||||
|
|||||||
+21
-118
@@ -13,96 +13,52 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import importlib
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import gymnasium as gym
|
import gymnasium as gym
|
||||||
from gymnasium.envs.registration import registry as gym_registry
|
|
||||||
|
|
||||||
from lerobot.configs.policies import PreTrainedConfig
|
from lerobot.envs.configs import EnvConfig, HubEnvConfig
|
||||||
from lerobot.envs.configs import AlohaEnv, EnvConfig, HubEnvConfig, IsaaclabArenaEnv, LiberoEnv, PushtEnv
|
|
||||||
from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
|
from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
|
||||||
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
|
||||||
from lerobot.processor import ProcessorStep
|
|
||||||
from lerobot.processor.env_processor import IsaaclabArenaProcessorStep, LiberoProcessorStep
|
|
||||||
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
|
||||||
|
|
||||||
|
|
||||||
def make_env_config(env_type: str, **kwargs) -> EnvConfig:
|
def make_env_config(env_type: str, **kwargs) -> EnvConfig:
|
||||||
if env_type == "aloha":
|
try:
|
||||||
return AlohaEnv(**kwargs)
|
cls = EnvConfig.get_choice_class(env_type)
|
||||||
elif env_type == "pusht":
|
except KeyError as err:
|
||||||
return PushtEnv(**kwargs)
|
raise ValueError(
|
||||||
elif env_type == "libero":
|
f"Environment type '{env_type}' is not registered. "
|
||||||
return LiberoEnv(**kwargs)
|
f"Available: {list(EnvConfig.get_known_choices().keys())}"
|
||||||
else:
|
) from err
|
||||||
raise ValueError(f"Policy type '{env_type}' is not available.")
|
return cls(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
def make_env_pre_post_processors(
|
def make_env_pre_post_processors(
|
||||||
env_cfg: EnvConfig,
|
env_cfg: EnvConfig,
|
||||||
policy_cfg: PreTrainedConfig,
|
policy_cfg: Any,
|
||||||
) -> tuple[
|
) -> tuple[Any, Any]:
|
||||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
|
||||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
|
||||||
]:
|
|
||||||
"""
|
"""
|
||||||
Create preprocessor and postprocessor pipelines for environment observations.
|
Create preprocessor and postprocessor pipelines for environment observations.
|
||||||
|
|
||||||
This function creates processor pipelines that transform raw environment
|
Returns a tuple of (preprocessor, postprocessor). By default, delegates to
|
||||||
observations and actions. By default, it returns identity processors that do nothing.
|
``env_cfg.get_env_processors()``. The XVLAConfig policy-specific override
|
||||||
For specific environments like LIBERO, it adds environment-specific processing steps.
|
stays here because it depends on the *policy* config, not the env config.
|
||||||
|
|
||||||
Args:
|
|
||||||
env_cfg: The configuration of the environment.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A tuple containing:
|
|
||||||
- preprocessor: Pipeline that processes environment observations
|
|
||||||
- postprocessor: Pipeline that processes environment outputs (currently identity)
|
|
||||||
"""
|
"""
|
||||||
# Preprocessor and Postprocessor steps are Identity for most environments
|
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
||||||
preprocessor_steps: list[ProcessorStep] = []
|
|
||||||
postprocessor_steps: list[ProcessorStep] = []
|
|
||||||
if isinstance(policy_cfg, XVLAConfig):
|
if isinstance(policy_cfg, XVLAConfig):
|
||||||
from lerobot.policies.xvla.processor_xvla import make_xvla_libero_pre_post_processors
|
from lerobot.policies.xvla.processor_xvla import make_xvla_libero_pre_post_processors
|
||||||
|
|
||||||
return make_xvla_libero_pre_post_processors()
|
return make_xvla_libero_pre_post_processors()
|
||||||
|
|
||||||
# For LIBERO environments, add the LiberoProcessorStep to preprocessor
|
return env_cfg.get_env_processors()
|
||||||
if isinstance(env_cfg, LiberoEnv) or "libero" in env_cfg.type:
|
|
||||||
preprocessor_steps.append(LiberoProcessorStep())
|
|
||||||
|
|
||||||
# For Isaaclab Arena environments, add the IsaaclabArenaProcessorStep
|
|
||||||
if isinstance(env_cfg, IsaaclabArenaEnv) or "isaaclab_arena" in env_cfg.type:
|
|
||||||
# Parse comma-separated keys (handle None for state-based policies)
|
|
||||||
if env_cfg.state_keys:
|
|
||||||
state_keys = tuple(k.strip() for k in env_cfg.state_keys.split(",") if k.strip())
|
|
||||||
else:
|
|
||||||
state_keys = ()
|
|
||||||
if env_cfg.camera_keys:
|
|
||||||
camera_keys = tuple(k.strip() for k in env_cfg.camera_keys.split(",") if k.strip())
|
|
||||||
else:
|
|
||||||
camera_keys = ()
|
|
||||||
if not state_keys and not camera_keys:
|
|
||||||
raise ValueError("At least one of state_keys or camera_keys must be specified.")
|
|
||||||
preprocessor_steps.append(
|
|
||||||
IsaaclabArenaProcessorStep(
|
|
||||||
state_keys=state_keys,
|
|
||||||
camera_keys=camera_keys,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
preprocessor = PolicyProcessorPipeline(steps=preprocessor_steps)
|
|
||||||
postprocessor = PolicyProcessorPipeline(steps=postprocessor_steps)
|
|
||||||
|
|
||||||
return preprocessor, postprocessor
|
|
||||||
|
|
||||||
|
|
||||||
def make_env(
|
def make_env(
|
||||||
cfg: EnvConfig | str,
|
cfg: EnvConfig | str,
|
||||||
n_envs: int = 1,
|
n_envs: int = 1,
|
||||||
use_async_envs: bool = False,
|
use_async_envs: bool = True,
|
||||||
hub_cache_dir: str | None = None,
|
hub_cache_dir: str | None = None,
|
||||||
trust_remote_code: bool = False,
|
trust_remote_code: bool = False,
|
||||||
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
||||||
@@ -163,57 +119,4 @@ def make_env(
|
|||||||
if n_envs < 1:
|
if n_envs < 1:
|
||||||
raise ValueError("`n_envs` must be at least 1")
|
raise ValueError("`n_envs` must be at least 1")
|
||||||
|
|
||||||
env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
|
return cfg.create_envs(n_envs=n_envs, use_async_envs=use_async_envs)
|
||||||
|
|
||||||
if "libero" in cfg.type:
|
|
||||||
from lerobot.envs.libero import create_libero_envs
|
|
||||||
|
|
||||||
if cfg.task is None:
|
|
||||||
raise ValueError("LiberoEnv requires a task to be specified")
|
|
||||||
|
|
||||||
return create_libero_envs(
|
|
||||||
task=cfg.task,
|
|
||||||
n_envs=n_envs,
|
|
||||||
camera_name=cfg.camera_name,
|
|
||||||
init_states=cfg.init_states,
|
|
||||||
gym_kwargs=cfg.gym_kwargs,
|
|
||||||
env_cls=env_cls,
|
|
||||||
control_mode=cfg.control_mode,
|
|
||||||
episode_length=cfg.episode_length,
|
|
||||||
)
|
|
||||||
elif "metaworld" in cfg.type:
|
|
||||||
from lerobot.envs.metaworld import create_metaworld_envs
|
|
||||||
|
|
||||||
if cfg.task is None:
|
|
||||||
raise ValueError("MetaWorld requires a task to be specified")
|
|
||||||
|
|
||||||
return create_metaworld_envs(
|
|
||||||
task=cfg.task,
|
|
||||||
n_envs=n_envs,
|
|
||||||
gym_kwargs=cfg.gym_kwargs,
|
|
||||||
env_cls=env_cls,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.gym_id not in gym_registry:
|
|
||||||
print(f"gym id '{cfg.gym_id}' not found, attempting to import '{cfg.package_name}'...")
|
|
||||||
try:
|
|
||||||
importlib.import_module(cfg.package_name)
|
|
||||||
except ModuleNotFoundError as e:
|
|
||||||
raise ModuleNotFoundError(
|
|
||||||
f"Package '{cfg.package_name}' required for env '{cfg.type}' not found. "
|
|
||||||
f"Please install it or check PYTHONPATH."
|
|
||||||
) from e
|
|
||||||
|
|
||||||
if cfg.gym_id not in gym_registry:
|
|
||||||
raise gym.error.NameNotFound(
|
|
||||||
f"Environment '{cfg.gym_id}' not registered even after importing '{cfg.package_name}'."
|
|
||||||
)
|
|
||||||
|
|
||||||
def _make_one():
|
|
||||||
return gym.make(cfg.gym_id, disable_env_checker=cfg.disable_env_checker, **(cfg.gym_kwargs or {}))
|
|
||||||
|
|
||||||
vec = env_cls([_make_one for _ in range(n_envs)], autoreset_mode=gym.vector.AutoresetMode.SAME_STEP)
|
|
||||||
|
|
||||||
# normalize to {suite: {task_id: vec_env}} for consistency
|
|
||||||
suite_name = cfg.type # e.g., "pusht", "aloha"
|
|
||||||
return {suite_name: {0: vec}}
|
|
||||||
|
|||||||
+57
-26
@@ -29,6 +29,7 @@ from gymnasium import spaces
|
|||||||
from libero.libero import benchmark, get_libero_path
|
from libero.libero import benchmark, get_libero_path
|
||||||
from libero.libero.envs import OffScreenRenderEnv
|
from libero.libero.envs import OffScreenRenderEnv
|
||||||
|
|
||||||
|
from lerobot.envs.utils import _LazyAsyncVectorEnv
|
||||||
from lerobot.types import RobotObservation
|
from lerobot.types import RobotObservation
|
||||||
|
|
||||||
|
|
||||||
@@ -150,7 +151,17 @@ class LiberoEnv(gym.Env):
|
|||||||
|
|
||||||
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
|
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
|
||||||
|
|
||||||
self._env = self._make_envs_task(task_suite, self.task_id)
|
# Extract task metadata without allocating GPU resources (safe before fork).
|
||||||
|
task = task_suite.get_task(task_id)
|
||||||
|
self.task = task.name
|
||||||
|
self.task_description = task.language
|
||||||
|
self._task_bddl_file = os.path.join(
|
||||||
|
get_libero_path("bddl_files"), task.problem_folder, task.bddl_file
|
||||||
|
)
|
||||||
|
self._env: OffScreenRenderEnv | None = (
|
||||||
|
None # deferred — created on first reset() inside the worker subprocess
|
||||||
|
)
|
||||||
|
|
||||||
default_steps = 500
|
default_steps = 500
|
||||||
self._max_episode_steps = (
|
self._max_episode_steps = (
|
||||||
TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps)
|
TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps)
|
||||||
@@ -221,28 +232,33 @@ class LiberoEnv(gym.Env):
|
|||||||
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
|
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _ensure_env(self) -> None:
|
||||||
|
"""Create the underlying OffScreenRenderEnv on first use.
|
||||||
|
|
||||||
|
Called inside the worker subprocess after fork(), so each worker gets
|
||||||
|
its own clean EGL context rather than inheriting a stale one from the
|
||||||
|
parent process (which causes EGL_BAD_CONTEXT crashes with AsyncVectorEnv).
|
||||||
|
"""
|
||||||
|
if self._env is not None:
|
||||||
|
return
|
||||||
|
env = OffScreenRenderEnv(
|
||||||
|
bddl_file_name=self._task_bddl_file,
|
||||||
|
camera_heights=self.observation_height,
|
||||||
|
camera_widths=self.observation_width,
|
||||||
|
)
|
||||||
|
env.reset()
|
||||||
|
self._env = env
|
||||||
|
|
||||||
def render(self):
|
def render(self):
|
||||||
|
self._ensure_env()
|
||||||
raw_obs = self._env.env._get_observations()
|
raw_obs = self._env.env._get_observations()
|
||||||
image = self._format_raw_obs(raw_obs)["pixels"]["image"]
|
pixels = self._format_raw_obs(raw_obs)["pixels"]
|
||||||
|
image = next(iter(pixels.values()))
|
||||||
image = image[::-1, ::-1] # flip both H and W for visualization
|
image = image[::-1, ::-1] # flip both H and W for visualization
|
||||||
return image
|
return image
|
||||||
|
|
||||||
def _make_envs_task(self, task_suite: Any, task_id: int = 0):
|
|
||||||
task = task_suite.get_task(task_id)
|
|
||||||
self.task = task.name
|
|
||||||
self.task_description = task.language
|
|
||||||
task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
|
|
||||||
|
|
||||||
env_args = {
|
|
||||||
"bddl_file_name": task_bddl_file,
|
|
||||||
"camera_heights": self.observation_height,
|
|
||||||
"camera_widths": self.observation_width,
|
|
||||||
}
|
|
||||||
env = OffScreenRenderEnv(**env_args)
|
|
||||||
env.reset()
|
|
||||||
return env
|
|
||||||
|
|
||||||
def _format_raw_obs(self, raw_obs: RobotObservation) -> RobotObservation:
|
def _format_raw_obs(self, raw_obs: RobotObservation) -> RobotObservation:
|
||||||
|
assert self._env is not None, "_format_raw_obs called before _ensure_env()"
|
||||||
images = {}
|
images = {}
|
||||||
for camera_name in self.camera_name:
|
for camera_name in self.camera_name:
|
||||||
image = raw_obs[camera_name]
|
image = raw_obs[camera_name]
|
||||||
@@ -294,6 +310,7 @@ class LiberoEnv(gym.Env):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def reset(self, seed=None, **kwargs):
|
def reset(self, seed=None, **kwargs):
|
||||||
|
self._ensure_env()
|
||||||
super().reset(seed=seed)
|
super().reset(seed=seed)
|
||||||
self._env.seed(seed)
|
self._env.seed(seed)
|
||||||
raw_obs = self._env.reset()
|
raw_obs = self._env.reset()
|
||||||
@@ -320,6 +337,8 @@ class LiberoEnv(gym.Env):
|
|||||||
return observation, info
|
return observation, info
|
||||||
|
|
||||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||||
|
self._ensure_env()
|
||||||
|
assert self._env is not None
|
||||||
if action.ndim != 1:
|
if action.ndim != 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||||
@@ -339,18 +358,13 @@ class LiberoEnv(gym.Env):
|
|||||||
)
|
)
|
||||||
observation = self._format_raw_obs(raw_obs)
|
observation = self._format_raw_obs(raw_obs)
|
||||||
if terminated:
|
if terminated:
|
||||||
info["final_info"] = {
|
|
||||||
"task": self.task,
|
|
||||||
"task_id": self.task_id,
|
|
||||||
"done": bool(done),
|
|
||||||
"is_success": bool(is_success),
|
|
||||||
}
|
|
||||||
self.reset()
|
self.reset()
|
||||||
truncated = False
|
truncated = False
|
||||||
return observation, reward, terminated, truncated, info
|
return observation, reward, terminated, truncated, info
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self._env.close()
|
if self._env is not None:
|
||||||
|
self._env.close()
|
||||||
|
|
||||||
|
|
||||||
def _make_env_fns(
|
def _make_env_fns(
|
||||||
@@ -364,6 +378,7 @@ def _make_env_fns(
|
|||||||
init_states: bool,
|
init_states: bool,
|
||||||
gym_kwargs: Mapping[str, Any],
|
gym_kwargs: Mapping[str, Any],
|
||||||
control_mode: str,
|
control_mode: str,
|
||||||
|
camera_name_mapping: dict[str, str] | None = None,
|
||||||
) -> list[Callable[[], LiberoEnv]]:
|
) -> list[Callable[[], LiberoEnv]]:
|
||||||
"""Build n_envs factory callables for a single (suite, task_id)."""
|
"""Build n_envs factory callables for a single (suite, task_id)."""
|
||||||
|
|
||||||
@@ -379,6 +394,7 @@ def _make_env_fns(
|
|||||||
episode_index=episode_index,
|
episode_index=episode_index,
|
||||||
n_envs=n_envs,
|
n_envs=n_envs,
|
||||||
control_mode=control_mode,
|
control_mode=control_mode,
|
||||||
|
camera_name_mapping=camera_name_mapping,
|
||||||
**local_kwargs,
|
**local_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -400,6 +416,7 @@ def create_libero_envs(
|
|||||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||||
control_mode: str = "relative",
|
control_mode: str = "relative",
|
||||||
episode_length: int | None = None,
|
episode_length: int | None = None,
|
||||||
|
camera_name_mapping: dict[str, str] | None = None,
|
||||||
) -> dict[str, dict[int, Any]]:
|
) -> dict[str, dict[int, Any]]:
|
||||||
"""
|
"""
|
||||||
Create vectorized LIBERO environments with a consistent return shape.
|
Create vectorized LIBERO environments with a consistent return shape.
|
||||||
@@ -430,6 +447,8 @@ def create_libero_envs(
|
|||||||
if task_ids_filter is not None:
|
if task_ids_filter is not None:
|
||||||
print(f"Restricting to task_ids={task_ids_filter}")
|
print(f"Restricting to task_ids={task_ids_filter}")
|
||||||
|
|
||||||
|
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||||
|
|
||||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||||
for suite_name in suite_names:
|
for suite_name in suite_names:
|
||||||
suite = _get_suite(suite_name)
|
suite = _get_suite(suite_name)
|
||||||
@@ -438,6 +457,11 @@ def create_libero_envs(
|
|||||||
if not selected:
|
if not selected:
|
||||||
raise ValueError(f"No tasks selected for suite '{suite_name}' (available: {total}).")
|
raise ValueError(f"No tasks selected for suite '{suite_name}' (available: {total}).")
|
||||||
|
|
||||||
|
# All tasks in a suite share identical observation/action spaces.
|
||||||
|
# Probe once and reuse to avoid creating a temp env per task.
|
||||||
|
cached_obs_space: spaces.Space | None = None
|
||||||
|
cached_act_space: spaces.Space | None = None
|
||||||
|
|
||||||
for tid in selected:
|
for tid in selected:
|
||||||
fns = _make_env_fns(
|
fns = _make_env_fns(
|
||||||
suite=suite,
|
suite=suite,
|
||||||
@@ -449,9 +473,16 @@ def create_libero_envs(
|
|||||||
init_states=init_states,
|
init_states=init_states,
|
||||||
gym_kwargs=gym_kwargs,
|
gym_kwargs=gym_kwargs,
|
||||||
control_mode=control_mode,
|
control_mode=control_mode,
|
||||||
|
camera_name_mapping=camera_name_mapping,
|
||||||
)
|
)
|
||||||
out[suite_name][tid] = env_cls(fns)
|
if is_async:
|
||||||
|
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||||
|
if cached_obs_space is None:
|
||||||
|
cached_obs_space = lazy.observation_space
|
||||||
|
cached_act_space = lazy.action_space
|
||||||
|
out[suite_name][tid] = lazy
|
||||||
|
else:
|
||||||
|
out[suite_name][tid] = env_cls(fns)
|
||||||
print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}")
|
print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}")
|
||||||
|
|
||||||
# return plain dicts for predictability
|
|
||||||
return {suite: dict(task_map) for suite, task_map in out.items()}
|
return {suite: dict(task_map) for suite, task_map in out.items()}
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ import metaworld.policies as policies
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from gymnasium import spaces
|
from gymnasium import spaces
|
||||||
|
|
||||||
|
from lerobot.envs.utils import _LazyAsyncVectorEnv
|
||||||
from lerobot.types import RobotObservation
|
from lerobot.types import RobotObservation
|
||||||
|
|
||||||
# ---- Load configuration data from the external JSON file ----
|
# ---- Load configuration data from the external JSON file ----
|
||||||
@@ -97,8 +98,9 @@ class MetaworldEnv(gym.Env):
|
|||||||
self.visualization_height = visualization_height
|
self.visualization_height = visualization_height
|
||||||
self.camera_name = camera_name
|
self.camera_name = camera_name
|
||||||
|
|
||||||
self._env = self._make_envs_task(self.task)
|
self._env_name = self.task # already stripped of "metaworld-" prefix above
|
||||||
self._max_episode_steps = self._env.max_path_length
|
self._env = None # deferred — created on first reset() inside the worker subprocess
|
||||||
|
self._max_episode_steps = 500 # MT1 environments always have max_path_length=500
|
||||||
self.task_description = TASK_DESCRIPTIONS[self.task]
|
self.task_description = TASK_DESCRIPTIONS[self.task]
|
||||||
|
|
||||||
self.expert_policy = TASK_POLICY_MAPPING[self.task]()
|
self.expert_policy = TASK_POLICY_MAPPING[self.task]()
|
||||||
@@ -136,6 +138,24 @@ class MetaworldEnv(gym.Env):
|
|||||||
|
|
||||||
self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32)
|
self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32)
|
||||||
|
|
||||||
|
def _ensure_env(self) -> None:
|
||||||
|
"""Create the underlying MetaWorld env on first use.
|
||||||
|
|
||||||
|
Called inside the worker subprocess after fork(), so each worker gets
|
||||||
|
its own clean rendering context rather than inheriting a stale one from
|
||||||
|
the parent process (which causes crashes with AsyncVectorEnv).
|
||||||
|
"""
|
||||||
|
if self._env is not None:
|
||||||
|
return
|
||||||
|
mt1 = metaworld.MT1(self._env_name, seed=42)
|
||||||
|
env = mt1.train_classes[self._env_name](render_mode="rgb_array", camera_name=self.camera_name)
|
||||||
|
env.set_task(mt1.train_tasks[0])
|
||||||
|
if self.camera_name == "corner2":
|
||||||
|
env.model.cam_pos[2] = [0.75, 0.075, 0.7]
|
||||||
|
env.reset()
|
||||||
|
env._freeze_rand_vec = False # otherwise no randomization
|
||||||
|
self._env = env
|
||||||
|
|
||||||
def render(self) -> np.ndarray:
|
def render(self) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Render the current environment frame.
|
Render the current environment frame.
|
||||||
@@ -143,26 +163,13 @@ class MetaworldEnv(gym.Env):
|
|||||||
Returns:
|
Returns:
|
||||||
np.ndarray: The rendered RGB image from the environment.
|
np.ndarray: The rendered RGB image from the environment.
|
||||||
"""
|
"""
|
||||||
|
self._ensure_env()
|
||||||
image = self._env.render()
|
image = self._env.render()
|
||||||
if self.camera_name == "corner2":
|
if self.camera_name == "corner2":
|
||||||
# Images from this camera are flipped — correct them
|
# Images from this camera are flipped — correct them
|
||||||
image = np.flip(image, (0, 1))
|
image = np.flip(image, (0, 1))
|
||||||
return image
|
return image
|
||||||
|
|
||||||
def _make_envs_task(self, env_name: str):
|
|
||||||
mt1 = metaworld.MT1(env_name, seed=42)
|
|
||||||
env = mt1.train_classes[env_name](render_mode="rgb_array", camera_name=self.camera_name)
|
|
||||||
env.set_task(mt1.train_tasks[0])
|
|
||||||
if self.camera_name == "corner2":
|
|
||||||
env.model.cam_pos[2] = [
|
|
||||||
0.75,
|
|
||||||
0.075,
|
|
||||||
0.7,
|
|
||||||
] # corner2 position, similar to https://arxiv.org/pdf/2206.14244
|
|
||||||
env.reset()
|
|
||||||
env._freeze_rand_vec = False # otherwise no randomization
|
|
||||||
return env
|
|
||||||
|
|
||||||
def _format_raw_obs(self, raw_obs: np.ndarray) -> RobotObservation:
|
def _format_raw_obs(self, raw_obs: np.ndarray) -> RobotObservation:
|
||||||
image = None
|
image = None
|
||||||
if self._env is not None:
|
if self._env is not None:
|
||||||
@@ -209,6 +216,7 @@ class MetaworldEnv(gym.Env):
|
|||||||
observation (RobotObservation): The initial formatted observation.
|
observation (RobotObservation): The initial formatted observation.
|
||||||
info (Dict[str, Any]): Additional info about the reset state.
|
info (Dict[str, Any]): Additional info about the reset state.
|
||||||
"""
|
"""
|
||||||
|
self._ensure_env()
|
||||||
super().reset(seed=seed)
|
super().reset(seed=seed)
|
||||||
|
|
||||||
raw_obs, info = self._env.reset(seed=seed)
|
raw_obs, info = self._env.reset(seed=seed)
|
||||||
@@ -232,6 +240,7 @@ class MetaworldEnv(gym.Env):
|
|||||||
truncated (bool): Whether the episode was truncated due to a time limit.
|
truncated (bool): Whether the episode was truncated due to a time limit.
|
||||||
info (Dict[str, Any]): Additional environment info.
|
info (Dict[str, Any]): Additional environment info.
|
||||||
"""
|
"""
|
||||||
|
self._ensure_env()
|
||||||
if action.ndim != 1:
|
if action.ndim != 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||||
@@ -263,7 +272,8 @@ class MetaworldEnv(gym.Env):
|
|||||||
return observation, reward, terminated, truncated, info
|
return observation, reward, terminated, truncated, info
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self._env.close()
|
if self._env is not None:
|
||||||
|
self._env.close()
|
||||||
|
|
||||||
|
|
||||||
# ---- Main API ----------------------------------------------------------------
|
# ---- Main API ----------------------------------------------------------------
|
||||||
@@ -297,6 +307,9 @@ def create_metaworld_envs(
|
|||||||
|
|
||||||
print(f"Creating Meta-World envs | task_groups={task_groups} | n_envs(per task)={n_envs}")
|
print(f"Creating Meta-World envs | task_groups={task_groups} | n_envs(per task)={n_envs}")
|
||||||
|
|
||||||
|
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||||
|
cached_obs_space = None
|
||||||
|
cached_act_space = None
|
||||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||||
|
|
||||||
for group in task_groups:
|
for group in task_groups:
|
||||||
@@ -309,7 +322,14 @@ def create_metaworld_envs(
|
|||||||
# build n_envs factories
|
# build n_envs factories
|
||||||
fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
||||||
|
|
||||||
out[group][tid] = env_cls(fns)
|
if is_async:
|
||||||
|
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||||
|
if cached_obs_space is None:
|
||||||
|
cached_obs_space = lazy.observation_space
|
||||||
|
cached_act_space = lazy.action_space
|
||||||
|
out[group][tid] = lazy
|
||||||
|
else:
|
||||||
|
out[group][tid] = env_cls(fns)
|
||||||
|
|
||||||
# return a plain dict for consistency
|
# return a plain dict for consistency
|
||||||
return {group: dict(task_map) for group, task_map in out.items()}
|
return {group: dict(task_map) for group, task_map in out.items()}
|
||||||
|
|||||||
+70
-27
@@ -16,7 +16,7 @@
|
|||||||
import importlib.util
|
import importlib.util
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Mapping, Sequence
|
from collections.abc import Callable, Mapping, Sequence
|
||||||
from functools import singledispatch
|
from functools import singledispatch
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -130,56 +130,99 @@ def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
|
|||||||
return policy_features
|
return policy_features
|
||||||
|
|
||||||
|
|
||||||
def are_all_envs_same_type(env: gym.vector.VectorEnv) -> bool:
|
def _sub_env_has_attr(env: gym.vector.VectorEnv, attr: str) -> bool:
|
||||||
first_type = type(env.envs[0]) # Get type of first env
|
try:
|
||||||
return all(type(e) is first_type for e in env.envs) # Fast type check
|
env.get_attr(attr)
|
||||||
|
return True
|
||||||
|
except (AttributeError, Exception):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class _LazyAsyncVectorEnv:
|
||||||
|
"""Defers AsyncVectorEnv creation until first use.
|
||||||
|
|
||||||
|
Creating all tasks' AsyncVectorEnvs upfront spawns N_tasks × n_envs worker
|
||||||
|
processes, all of which allocate EGL/GPU resources immediately. Since tasks
|
||||||
|
are evaluated sequentially, only one task's workers need to be alive at a
|
||||||
|
time. This wrapper stores the factory functions and creates the real
|
||||||
|
AsyncVectorEnv on first reset()/step()/call(), keeping peak process count = n_envs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
env_fns: list[Callable],
|
||||||
|
observation_space=None,
|
||||||
|
action_space=None,
|
||||||
|
):
|
||||||
|
self._env_fns = env_fns
|
||||||
|
self._env: gym.vector.AsyncVectorEnv | None = None
|
||||||
|
self.num_envs = len(env_fns)
|
||||||
|
if observation_space is not None and action_space is not None:
|
||||||
|
self.observation_space = observation_space
|
||||||
|
self.action_space = action_space
|
||||||
|
else:
|
||||||
|
tmp = env_fns[0]()
|
||||||
|
self.observation_space = tmp.observation_space
|
||||||
|
self.action_space = tmp.action_space
|
||||||
|
tmp.close()
|
||||||
|
self.single_observation_space = self.observation_space
|
||||||
|
self.single_action_space = self.action_space
|
||||||
|
|
||||||
|
def _ensure(self) -> None:
|
||||||
|
if self._env is None:
|
||||||
|
self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)
|
||||||
|
|
||||||
|
def reset(self, **kwargs):
|
||||||
|
self._ensure()
|
||||||
|
return self._env.reset(**kwargs)
|
||||||
|
|
||||||
|
def step(self, actions):
|
||||||
|
self._ensure()
|
||||||
|
return self._env.step(actions)
|
||||||
|
|
||||||
|
def call(self, name, *args, **kwargs):
|
||||||
|
self._ensure()
|
||||||
|
return self._env.call(name, *args, **kwargs)
|
||||||
|
|
||||||
|
def get_attr(self, name):
|
||||||
|
self._ensure()
|
||||||
|
return self._env.get_attr(name)
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
if self._env is not None:
|
||||||
|
self._env.close()
|
||||||
|
self._env = None
|
||||||
|
|
||||||
|
|
||||||
def check_env_attributes_and_types(env: gym.vector.VectorEnv) -> None:
|
def check_env_attributes_and_types(env: gym.vector.VectorEnv) -> None:
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("once", UserWarning) # Apply filter only in this function
|
warnings.simplefilter("once", UserWarning)
|
||||||
|
|
||||||
if not (hasattr(env.envs[0], "task_description") and hasattr(env.envs[0], "task")):
|
if not (_sub_env_has_attr(env, "task_description") and _sub_env_has_attr(env, "task")):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The environment does not have 'task_description' and 'task'. Some policies require these features.",
|
"The environment does not have 'task_description' and 'task'. Some policies require these features.",
|
||||||
UserWarning,
|
UserWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
if not are_all_envs_same_type(env):
|
|
||||||
warnings.warn(
|
|
||||||
"The environments have different types. Make sure you infer the right task from each environment. Empty task will be passed instead.",
|
|
||||||
UserWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def add_envs_task(env: gym.vector.VectorEnv, observation: RobotObservation) -> RobotObservation:
|
def add_envs_task(env: gym.vector.VectorEnv, observation: RobotObservation) -> RobotObservation:
|
||||||
"""Adds task feature to the observation dict with respect to the first environment attribute."""
|
"""Adds task feature to the observation dict with respect to the first environment attribute."""
|
||||||
if hasattr(env.envs[0], "task_description"):
|
if _sub_env_has_attr(env, "task_description"):
|
||||||
task_result = env.call("task_description")
|
task_result = list(env.call("task_description"))
|
||||||
|
|
||||||
if isinstance(task_result, tuple):
|
|
||||||
task_result = list(task_result)
|
|
||||||
|
|
||||||
if not isinstance(task_result, list):
|
|
||||||
raise TypeError(f"Expected task_description to return a list, got {type(task_result)}")
|
|
||||||
if not all(isinstance(item, str) for item in task_result):
|
if not all(isinstance(item, str) for item in task_result):
|
||||||
raise TypeError("All items in task_description result must be strings")
|
raise TypeError("All items in task_description result must be strings")
|
||||||
|
|
||||||
observation["task"] = task_result
|
observation["task"] = task_result
|
||||||
elif hasattr(env.envs[0], "task"):
|
elif _sub_env_has_attr(env, "task"):
|
||||||
task_result = env.call("task")
|
task_result = list(env.call("task"))
|
||||||
|
|
||||||
if isinstance(task_result, tuple):
|
|
||||||
task_result = list(task_result)
|
|
||||||
|
|
||||||
if not isinstance(task_result, list):
|
|
||||||
raise TypeError(f"Expected task to return a list, got {type(task_result)}")
|
|
||||||
if not all(isinstance(item, str) for item in task_result):
|
if not all(isinstance(item, str) for item in task_result):
|
||||||
raise TypeError("All items in task result must be strings")
|
raise TypeError("All items in task result must be strings")
|
||||||
|
|
||||||
observation["task"] = task_result
|
observation["task"] = task_result
|
||||||
else: # For envs without language instructions, e.g. aloha transfer cube and etc.
|
else:
|
||||||
num_envs = observation[list(observation.keys())[0]].shape[0]
|
num_envs = observation[list(observation.keys())[0]].shape[0]
|
||||||
observation["task"] = ["" for _ in range(num_envs)]
|
observation["task"] = ["" for _ in range(num_envs)]
|
||||||
return observation
|
return observation
|
||||||
|
|||||||
@@ -136,8 +136,8 @@ class TokenizerProcessorStep(ObservationProcessorStep):
|
|||||||
# Standardize to a list of strings for the tokenizer
|
# Standardize to a list of strings for the tokenizer
|
||||||
if isinstance(task, str):
|
if isinstance(task, str):
|
||||||
return [task]
|
return [task]
|
||||||
elif isinstance(task, list) and all(isinstance(t, str) for t in task):
|
elif isinstance(task, (list, tuple)) and all(isinstance(t, str) for t in task):
|
||||||
return task
|
return list(task)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -73,7 +73,6 @@ from lerobot.configs import parser
|
|||||||
from lerobot.configs.eval import EvalPipelineConfig
|
from lerobot.configs.eval import EvalPipelineConfig
|
||||||
from lerobot.envs.factory import make_env, make_env_pre_post_processors
|
from lerobot.envs.factory import make_env, make_env_pre_post_processors
|
||||||
from lerobot.envs.utils import (
|
from lerobot.envs.utils import (
|
||||||
add_envs_task,
|
|
||||||
check_env_attributes_and_types,
|
check_env_attributes_and_types,
|
||||||
close_envs,
|
close_envs,
|
||||||
preprocess_observation,
|
preprocess_observation,
|
||||||
@@ -166,9 +165,15 @@ def rollout(
|
|||||||
if return_observations:
|
if return_observations:
|
||||||
all_observations.append(deepcopy(observation))
|
all_observations.append(deepcopy(observation))
|
||||||
|
|
||||||
# Infer "task" from attributes of environments.
|
# Infer "task" from sub-environments (prefer natural language description).
|
||||||
# TODO: works with SyncVectorEnv but not AsyncVectorEnv
|
# env.call() works with both SyncVectorEnv and AsyncVectorEnv.
|
||||||
observation = add_envs_task(env, observation)
|
try:
|
||||||
|
observation["task"] = list(env.call("task_description"))
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
observation["task"] = list(env.call("task"))
|
||||||
|
except Exception:
|
||||||
|
observation["task"] = [""] * env.num_envs
|
||||||
|
|
||||||
# Apply environment-specific preprocessing (e.g., LiberoProcessorStep for LIBERO)
|
# Apply environment-specific preprocessing (e.g., LiberoProcessorStep for LIBERO)
|
||||||
observation = env_preprocessor(observation)
|
observation = env_preprocessor(observation)
|
||||||
@@ -201,6 +206,11 @@ def rollout(
|
|||||||
"You're likely using an older version of gymnasium (< 1.0). Please upgrade."
|
"You're likely using an older version of gymnasium (< 1.0). Please upgrade."
|
||||||
)
|
)
|
||||||
successes = final_info["is_success"].tolist()
|
successes = final_info["is_success"].tolist()
|
||||||
|
elif "is_success" in info:
|
||||||
|
is_success = info["is_success"]
|
||||||
|
successes = (
|
||||||
|
is_success.tolist() if hasattr(is_success, "tolist") else [bool(is_success)] * env.num_envs
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
successes = [False] * env.num_envs
|
successes = [False] * env.num_envs
|
||||||
|
|
||||||
@@ -313,8 +323,9 @@ def eval_policy(
|
|||||||
n_to_render_now = min(max_episodes_rendered - n_episodes_rendered, env.num_envs)
|
n_to_render_now = min(max_episodes_rendered - n_episodes_rendered, env.num_envs)
|
||||||
if isinstance(env, gym.vector.SyncVectorEnv):
|
if isinstance(env, gym.vector.SyncVectorEnv):
|
||||||
ep_frames.append(np.stack([env.envs[i].render() for i in range(n_to_render_now)])) # noqa: B023
|
ep_frames.append(np.stack([env.envs[i].render() for i in range(n_to_render_now)])) # noqa: B023
|
||||||
elif isinstance(env, gym.vector.AsyncVectorEnv):
|
elif hasattr(env, "call"):
|
||||||
# Here we must render all frames and discard any we don't need.
|
# Here we must render all frames and discard any we don't need.
|
||||||
|
# Covers AsyncVectorEnv and _LazyAsyncVectorEnv (which wraps one).
|
||||||
ep_frames.append(np.stack(env.call("render")[:n_to_render_now]))
|
ep_frames.append(np.stack(env.call("render")[:n_to_render_now]))
|
||||||
|
|
||||||
if max_episodes_rendered > 0:
|
if max_episodes_rendered > 0:
|
||||||
@@ -516,7 +527,7 @@ def eval_main(cfg: EvalPipelineConfig):
|
|||||||
|
|
||||||
logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
|
logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
|
||||||
|
|
||||||
logging.info("Making environment.")
|
logging.info(f"Making environment (batch_size={cfg.eval.batch_size}, async={cfg.eval.use_async_envs}).")
|
||||||
envs = make_env(
|
envs = make_env(
|
||||||
cfg.env,
|
cfg.env,
|
||||||
n_envs=cfg.eval.batch_size,
|
n_envs=cfg.eval.batch_size,
|
||||||
@@ -750,23 +761,39 @@ def eval_policy_all(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if max_parallel_tasks <= 1:
|
if max_parallel_tasks <= 1:
|
||||||
# sequential path (single accumulator path on the main thread)
|
prefetch_thread: threading.Thread | None = None
|
||||||
# NOTE: keeping a single-threaded accumulator avoids concurrent list appends or locks
|
for i, (task_group, task_id, env) in enumerate(tasks):
|
||||||
for task_group, task_id, env in tasks:
|
if prefetch_thread is not None:
|
||||||
tg, tid, metrics = task_runner(task_group, task_id, env)
|
prefetch_thread.join()
|
||||||
_accumulate_to(tg, metrics)
|
prefetch_thread = None
|
||||||
per_task_infos.append({"task_group": tg, "task_id": tid, "metrics": metrics})
|
|
||||||
|
try:
|
||||||
|
tg, tid, metrics = task_runner(task_group, task_id, env)
|
||||||
|
_accumulate_to(tg, metrics)
|
||||||
|
per_task_infos.append({"task_group": tg, "task_id": tid, "metrics": metrics})
|
||||||
|
finally:
|
||||||
|
env.close()
|
||||||
|
# Prefetch next task's workers *after* closing current env to prevent
|
||||||
|
# GPU memory overlap between consecutive tasks.
|
||||||
|
if i + 1 < len(tasks):
|
||||||
|
next_env = tasks[i + 1][2]
|
||||||
|
if hasattr(next_env, "_ensure"):
|
||||||
|
prefetch_thread = threading.Thread(target=next_env._ensure, daemon=True)
|
||||||
|
prefetch_thread.start()
|
||||||
else:
|
else:
|
||||||
# threaded path: submit all tasks, consume completions on main thread and accumulate there
|
|
||||||
with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
|
with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
|
||||||
fut2meta = {}
|
fut2meta = {}
|
||||||
for task_group, task_id, env in tasks:
|
for task_group, task_id, env in tasks:
|
||||||
fut = executor.submit(task_runner, task_group, task_id, env)
|
fut = executor.submit(task_runner, task_group, task_id, env)
|
||||||
fut2meta[fut] = (task_group, task_id)
|
fut2meta[fut] = (task_group, task_id, env)
|
||||||
for fut in cf.as_completed(fut2meta):
|
for fut in cf.as_completed(fut2meta):
|
||||||
tg, tid, metrics = fut.result()
|
tg, tid, env = fut2meta[fut]
|
||||||
_accumulate_to(tg, metrics)
|
try:
|
||||||
per_task_infos.append({"task_group": tg, "task_id": tid, "metrics": metrics})
|
tg, tid, metrics = fut.result()
|
||||||
|
_accumulate_to(tg, metrics)
|
||||||
|
per_task_infos.append({"task_group": tg, "task_id": tid, "metrics": metrics})
|
||||||
|
finally:
|
||||||
|
env.close()
|
||||||
|
|
||||||
# compute aggregated metrics helper (robust to lists/scalars)
|
# compute aggregated metrics helper (robust to lists/scalars)
|
||||||
def _agg_from_list(xs):
|
def _agg_from_list(xs):
|
||||||
|
|||||||
@@ -0,0 +1,143 @@
|
|||||||
|
"""Tests for the benchmark dispatch refactor (create_envs / get_env_processors on EnvConfig)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
import gymnasium as gym
|
||||||
|
import pytest
|
||||||
|
from gymnasium.envs.registration import register, registry as gym_registry
|
||||||
|
|
||||||
|
from lerobot.configs.types import PolicyFeature
|
||||||
|
from lerobot.envs.configs import EnvConfig
|
||||||
|
from lerobot.envs.factory import make_env, make_env_config, make_env_pre_post_processors
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_all_types():
|
||||||
|
"""make_env_config should resolve every registered EnvConfig subclass via the registry."""
|
||||||
|
known = list(EnvConfig.get_known_choices().keys())
|
||||||
|
assert len(known) >= 6
|
||||||
|
for t in known:
|
||||||
|
cfg = make_env_config(t)
|
||||||
|
if not isinstance(cfg, EnvConfig):
|
||||||
|
continue
|
||||||
|
assert cfg.type == t
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_type():
|
||||||
|
with pytest.raises(ValueError, match="not registered"):
|
||||||
|
make_env_config("nonexistent")
|
||||||
|
|
||||||
|
|
||||||
|
def test_identity_processors():
|
||||||
|
"""Base class get_env_processors() returns identity pipelines."""
|
||||||
|
cfg = make_env_config("aloha")
|
||||||
|
pre, post = cfg.get_env_processors()
|
||||||
|
assert len(pre.steps) == 0 and len(post.steps) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_delegation():
|
||||||
|
"""make_env() should call cfg.create_envs(), not use if/elif dispatch."""
|
||||||
|
sentinel = {"delegated": {0: "marker"}}
|
||||||
|
fake = type(
|
||||||
|
"Fake",
|
||||||
|
(),
|
||||||
|
{
|
||||||
|
"hub_path": None,
|
||||||
|
"create_envs": lambda self, n_envs, use_async_envs=False: sentinel,
|
||||||
|
},
|
||||||
|
)()
|
||||||
|
result = make_env(fake, n_envs=1)
|
||||||
|
assert result is sentinel
|
||||||
|
|
||||||
|
|
||||||
|
def test_processors_delegation():
|
||||||
|
"""make_env_pre_post_processors delegates to cfg.get_env_processors()."""
|
||||||
|
cfg = make_env_config("aloha")
|
||||||
|
pre, post = make_env_pre_post_processors(cfg, policy_cfg=None)
|
||||||
|
assert len(pre.steps) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_base_create_envs():
|
||||||
|
"""Base class create_envs() should build a single-task VectorEnv via gym.make()."""
|
||||||
|
gym_id = "_dispatch_test/CartPole-v99"
|
||||||
|
if gym_id not in gym_registry:
|
||||||
|
register(id=gym_id, entry_point="gymnasium.envs.classic_control:CartPoleEnv")
|
||||||
|
|
||||||
|
@EnvConfig.register_subclass("_dispatch_base_test")
|
||||||
|
@dataclass
|
||||||
|
class _Env(EnvConfig):
|
||||||
|
task: str = "CartPole-v99"
|
||||||
|
fps: int = 10
|
||||||
|
features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def package_name(self):
|
||||||
|
return "_dispatch_test"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def gym_id(self):
|
||||||
|
return gym_id
|
||||||
|
|
||||||
|
@property
|
||||||
|
def gym_kwargs(self):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
envs = _Env().create_envs(n_envs=2)
|
||||||
|
assert "_dispatch_base_test" in envs
|
||||||
|
env = envs["_dispatch_base_test"][0]
|
||||||
|
assert isinstance(env, gym.vector.VectorEnv)
|
||||||
|
assert env.num_envs == 2
|
||||||
|
env.close()
|
||||||
|
finally:
|
||||||
|
if gym_id in gym_registry:
|
||||||
|
del gym_registry[gym_id]
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_create_envs_override():
|
||||||
|
"""A custom EnvConfig subclass can override create_envs()."""
|
||||||
|
mock_vec = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])
|
||||||
|
|
||||||
|
@EnvConfig.register_subclass("_dispatch_custom_test")
|
||||||
|
@dataclass
|
||||||
|
class _Env(EnvConfig):
|
||||||
|
task: str = "x"
|
||||||
|
features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def gym_kwargs(self):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def create_envs(self, n_envs, use_async_envs=False):
|
||||||
|
return {"custom_suite": {0: mock_vec}}
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = make_env(_Env(), n_envs=1)
|
||||||
|
assert "custom_suite" in result
|
||||||
|
finally:
|
||||||
|
mock_vec.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_get_env_processors_override():
|
||||||
|
"""A custom EnvConfig subclass can override get_env_processors()."""
|
||||||
|
from lerobot.processor.pipeline import DataProcessorPipeline
|
||||||
|
|
||||||
|
@EnvConfig.register_subclass("_dispatch_proc_test")
|
||||||
|
@dataclass
|
||||||
|
class _Env(EnvConfig):
|
||||||
|
task: str = "x"
|
||||||
|
features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def gym_kwargs(self):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def get_env_processors(self):
|
||||||
|
return DataProcessorPipeline(steps=[]), DataProcessorPipeline(steps=[])
|
||||||
|
|
||||||
|
pre, post = _Env().get_env_processors()
|
||||||
|
assert isinstance(pre, DataProcessorPipeline)
|
||||||
@@ -189,6 +189,30 @@ def test_list_of_strings_tokenization(mock_auto_tokenizer):
|
|||||||
assert attention_mask.shape == (2, 8)
|
assert attention_mask.shape == (2, 8)
|
||||||
|
|
||||||
|
|
||||||
|
@require_package("transformers")
|
||||||
|
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
|
||||||
|
def test_tuple_of_strings_tokenization(mock_auto_tokenizer):
|
||||||
|
"""Test tokenization of a tuple of strings (returned by VectorEnv.call())."""
|
||||||
|
mock_tokenizer = MockTokenizer(vocab_size=100)
|
||||||
|
mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
|
||||||
|
|
||||||
|
processor = TokenizerProcessorStep(tokenizer_name="test-tokenizer", max_length=8)
|
||||||
|
|
||||||
|
transition = create_transition(
|
||||||
|
observation={"state": torch.tensor([1.0, 2.0])},
|
||||||
|
action=torch.tensor([0.1, 0.2]),
|
||||||
|
complementary_data={"task": ("pick up cube", "place on table")},
|
||||||
|
)
|
||||||
|
|
||||||
|
result = processor(transition)
|
||||||
|
|
||||||
|
observation = result[TransitionKey.OBSERVATION]
|
||||||
|
tokens = observation[f"{OBS_LANGUAGE}.tokens"]
|
||||||
|
attention_mask = observation[f"{OBS_LANGUAGE}.attention_mask"]
|
||||||
|
assert tokens.shape == (2, 8)
|
||||||
|
assert attention_mask.shape == (2, 8)
|
||||||
|
|
||||||
|
|
||||||
@require_package("transformers")
|
@require_package("transformers")
|
||||||
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
|
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
|
||||||
def test_custom_keys(mock_auto_tokenizer):
|
def test_custom_keys(mock_auto_tokenizer):
|
||||||
|
|||||||
Reference in New Issue
Block a user