diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index 3a5d92c44..b07c8f8da 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -843,3 +843,103 @@ jobs:
name: libero-plus-metrics
path: /tmp/libero-plus-artifacts/metrics.json
if-no-files-found: warn
+
+ # ── VLABENCH ─────────────────────────────────────────────────────────────
+  # Isolated image: VLABench installed from GitHub (no `vlabench` pip extra exists — see pyproject.toml note) plus the mujoco==3.2.2 / dm-control chain
+ vlabench-integration-test:
+ name: VLABench — build image + 1-episode eval
+ runs-on:
+ group: aws-g6-4xlarge-plus
+ env:
+ HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ lfs: true
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+ with:
+ cache-binary: false
+
+ - name: Login to Docker Hub
+ if: ${{ env.DOCKERHUB_USERNAME != '' }}
+ uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+ with:
+ username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+ env:
+ DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+ - name: Build VLABench benchmark image
+ uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+ with:
+ context: .
+ file: docker/Dockerfile.benchmark.vlabench
+ push: false
+ load: true
+ tags: lerobot-benchmark-vlabench:ci
+ build-args: |
+ VLABENCH_ASSETS_REPO=lerobot/vlabench-assets
+
+ - name: Run VLABench smoke eval (10 tasks, 1 episode each)
+ if: env.HF_USER_TOKEN != ''
+ run: |
+ docker run --name vlabench-eval --gpus all \
+ --shm-size=4g \
+ -e HF_HOME=/tmp/hf \
+ -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+ -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+ -e MUJOCO_GL=egl \
+ lerobot-benchmark-vlabench:ci \
+ bash -c "
+ hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+ lerobot-eval \
+ --policy.path=lerobot/smolvla_vlabench \
+ --env.type=vlabench \
+ --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+ --eval.batch_size=1 \
+ --eval.n_episodes=1 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \
+ --output_dir=/tmp/eval-artifacts
+ python scripts/ci/extract_task_descriptions.py \
+ --env vlabench \
+ --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+ --output /tmp/eval-artifacts/task_descriptions.json
+ "
+
+ - name: Copy VLABench artifacts from container
+ if: always()
+ run: |
+ mkdir -p /tmp/vlabench-artifacts
+ docker cp vlabench-eval:/tmp/eval-artifacts/. /tmp/vlabench-artifacts/ 2>/dev/null || true
+ docker rm -f vlabench-eval || true
+
+ - name: Parse VLABench eval metrics
+ if: always()
+ run: |
+ python3 scripts/ci/parse_eval_metrics.py \
+ --artifacts-dir /tmp/vlabench-artifacts \
+ --env vlabench \
+ --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+ --policy lerobot/smolvla_vlabench
+
+ - name: Upload VLABench rollout video
+ if: always()
+ uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+ with:
+ name: vlabench-rollout-video
+ path: /tmp/vlabench-artifacts/videos/
+ if-no-files-found: warn
+
+ - name: Upload VLABench eval metrics
+ if: always()
+ uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+ with:
+ name: vlabench-metrics
+ path: /tmp/vlabench-artifacts/metrics.json
+ if-no-files-found: warn
diff --git a/docker/Dockerfile.benchmark.vlabench b/docker/Dockerfile.benchmark.vlabench
new file mode 100644
index 000000000..13502a3e3
--- /dev/null
+++ b/docker/Dockerfile.benchmark.vlabench
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for VLABench integration tests.
+# Extends the nightly GPU image with the PR's source code and VLABench setup.
+#
+# Build: docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench .
+# Run: docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control.
+# Shallow-clone without submodule recursion (nested SSH-only submodules fail in CI).
+# Editable install (-e) because VLABench/utils/ has no __init__.py, so
+# find_packages() omits it from wheels; editable mode uses the source tree directly.
+# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py).
+# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import
+# time. Guard the call so missing dirs return [] instead of crashing (in case
+# the asset download is partial).
+#
+# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need
+# an upstream fix; don't rely on `main`/`develop` drift.
+ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017
+ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85
+RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \
+ git -C ~/VLABench checkout ${VLABENCH_SHA} && \
+ git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \
+ git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \
+ python3 -c "\
+import pathlib; \
+p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \
+t = p.read_text(); \
+p.write_text(t.replace( \
+ 'subdirs = os.listdir(xml_dir)', \
+ 'if not os.path.isdir(xml_dir): return []\n subdirs = os.listdir(xml_dir)'))" && \
+ uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \
+ mujoco==3.2.2 dm-control==1.0.22 \
+ open3d colorlog scikit-learn openai gdown
+
+# Download VLABench mesh assets. Task configs reference object meshes
+# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without
+# them the task builder picks from an empty mesh list and crashes with
+# IndexError at task-build time (random.choice([]) in config_manager.py).
+#
+# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time
+# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll
+# snapshot_download the repo into VLABench's assets dir. This is the reliable
+# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have
+# viewed or downloaded this file recently") on shared academic files.
+#
+# After download we *validate* that at least one XML exists under each
+# task-critical subtree and fail the build loudly if not. Silent-empty asset
+# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them
+# here rather than after a 10-minute eval build.
+#
+# Fallback: VLABench's own gdown-based script. Best-effort only.
+ARG VLABENCH_ASSETS_REPO=""
+RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \
+ if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \
+ echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \
+ uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \
+ python -c "from huggingface_hub import snapshot_download; \
+p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \
+ local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \
+print('snapshot_download returned:', p)"; \
+ else \
+ echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \
+ python ~/VLABench/scripts/download_assets.py --choice all; \
+ fi && \
+ python -c "\
+from pathlib import Path; \
+import sys; \
+root = Path('${ASSETS_DIR}'); \
+checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \
+failed = []; \
+print(f'Validating VLABench assets under {root}'); \
+[print(f' {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \
+[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \
+sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')"
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with VLABenchEnv registration
+# and updated obs handling) replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index d29f4c545..f5e1129f3 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -91,6 +91,8 @@
title: RoboMME
- local: envhub_isaaclab_arena
title: NVIDIA IsaacLab Arena Environments
+ - local: vlabench
+ title: VLABench
title: "Benchmarks"
- sections:
- local: introduction_processors
diff --git a/docs/source/vlabench.mdx b/docs/source/vlabench.mdx
new file mode 100644
index 000000000..da579d674
--- /dev/null
+++ b/docs/source/vlabench.mdx
@@ -0,0 +1,176 @@
+# VLABench
+
+[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below).
+
+- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194)
+- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench)
+- Project website: [vlabench.github.io](https://vlabench.github.io)
+- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench)
+
+
+
+## Available tasks
+
+LeRobot exposes two VLABench task suites — **43 task categories** in total — through `--env.task`:
+
+| Suite | CLI name | Tasks | Description |
+| --------- | ----------- | ----- | ---------------------------------------------------------------- |
+| Primitive | `primitive` | 21 | Single / few-skill combinations (select, insert, physics QA) |
+| Composite | `composite` | 22 | Multi-step reasoning and long-horizon planning (cook, rearrange) |
+
+**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`).
+
+**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`.
+
+`--env.task` accepts three forms:
+
+- a single task name (`select_fruit`)
+- a comma-separated list (`select_fruit,heat_food`)
+- a suite shortcut (`primitive`, `composite`, or `primitive,composite`)
+
+## Installation
+
+VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets:
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench
+git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms
+pip install -e ~/VLABench -e ~/rrt-algorithms
+pip install "mujoco==3.2.2" "dm-control==1.0.22" \
+ open3d colorlog scikit-learn openai gdown
+
+python ~/VLABench/scripts/download_assets.py
+```
+
+
+VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running:
+
+```bash
+export MUJOCO_GL=egl # for headless servers (HPC, cloud)
+```
+
+
+
+## Evaluation
+
+All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_vlabench \
+ --env.type=vlabench \
+ --env.task=select_fruit \
+ --eval.batch_size=1 \
+ --eval.n_episodes=10 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_vlabench \
+ --env.type=vlabench \
+ --env.task=select_fruit,select_toy,add_condiment,heat_food \
+ --eval.batch_size=1 \
+ --eval.n_episodes=10 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Suite-wide evaluation
+
+Run an entire suite (all 21 primitives or all 22 composites):
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_vlabench \
+ --env.type=vlabench \
+ --env.task=primitive \
+ --eval.batch_size=1 \
+ --eval.n_episodes=10 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ --env.max_parallel_tasks=1 \
+ '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+Or both suites:
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_vlabench \
+ --env.type=vlabench \
+ --env.task=primitive,composite \
+ --eval.batch_size=1 \
+ --eval.n_episodes=10 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ --env.max_parallel_tasks=1 \
+ '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper.
+
+## Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper)
+- `observation.images.image` — front camera, 480×480 HWC uint8
+- `observation.images.second_image` — second camera, 480×480 HWC uint8
+- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper.
+
+## Training
+
+### Datasets
+
+Pre-collected VLABench datasets in LeRobot format on the Hub:
+
+- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images.
+- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images.
+
+### Example training command
+
+Fine-tune a SmolVLA base on the primitive suite:
+
+```bash
+lerobot-train \
+ --policy.type=smolvla \
+ --policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \
+ --policy.load_vlm_weights=true \
+ --policy.push_to_hub=true \
+ --dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \
+ --env.type=vlabench \
+ --env.task=select_fruit \
+ --output_dir=./outputs/smolvla_vlabench_primitive \
+ --steps=100000 \
+ --batch_size=4 \
+ --eval_freq=5000 \
+ --eval.batch_size=1 \
+ --eval.n_episodes=1 \
+ --save_freq=10000
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark.
diff --git a/pyproject.toml b/pyproject.toml
index dbc866a49..790c7f2d9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -212,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
+# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
+# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
+# release), so any `vlabench>=X` pip spec is unresolvable. Install it
+# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx
+# for the recipe.
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
# resolve into a single env. Install it only in the RoboMME Docker image
diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py
index 3bdc9035f..0d27885cf 100644
--- a/scripts/ci/extract_task_descriptions.py
+++ b/scripts/ci/extract_task_descriptions.py
@@ -142,6 +142,21 @@ def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) ->
return out
+def _vlabench_descriptions(task_spec: str) -> dict[str, str]:
+ """For each task in the comma-separated list, emit a cleaned-name label.
+
+ VLABench tasks carry language instructions on their dm_control task
+ object, but pulling them requires loading the full env per task
+ (~seconds each). The CI smoke-eval already captures the instruction
+ inside its episode info; this mapping is just enough to key
+    `metrics.json` by `<task>_0`.
+ """
+ out: dict[str, str] = {}
+ for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+ out[f"{task}_0"] = task.replace("_", " ").strip()
+ return out
+
+
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -171,6 +186,8 @@ def main() -> int:
descriptions = _robocasa_descriptions(args.task)
elif args.env == "robomme":
descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
+ elif args.env == "vlabench":
+ descriptions = _vlabench_descriptions(args.task)
else:
print(
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 5ce06b475..84c40472f 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -573,6 +573,71 @@ class RoboCasaEnv(EnvConfig):
)
+@EnvConfig.register_subclass("vlabench")
+@dataclass
+class VLABenchEnv(EnvConfig):
+ task: str = "select_fruit"
+ fps: int = 10
+ episode_length: int = 500
+ obs_type: str = "pixels_agent_pos"
+ render_mode: str = "rgb_array"
+ render_resolution: tuple[int, int] = (480, 480)
+ robot: str = "franka"
+ action_mode: str = "eef"
+ features: dict[str, PolicyFeature] = field(
+ default_factory=lambda: {
+ ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
+ }
+ )
+ features_map: dict[str, str] = field(
+ default_factory=lambda: {
+ ACTION: ACTION,
+ "agent_pos": OBS_STATE,
+ "pixels/image": f"{OBS_IMAGES}.image",
+ "pixels/second_image": f"{OBS_IMAGES}.second_image",
+ "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
+ }
+ )
+
+ def __post_init__(self):
+ h, w = self.render_resolution
+ if self.obs_type == "pixels":
+ self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ elif self.obs_type == "pixels_agent_pos":
+ self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+ self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,))
+ else:
+ raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+ @property
+ def gym_kwargs(self) -> dict:
+ return {
+ "obs_type": self.obs_type,
+ "render_mode": self.render_mode,
+ "render_resolution": self.render_resolution,
+ "robot": self.robot,
+ "max_episode_steps": self.episode_length,
+ "action_mode": self.action_mode,
+ }
+
+ def create_envs(self, n_envs: int, use_async_envs: bool = False):
+ from .vlabench import create_vlabench_envs
+
+ if self.task is None:
+ raise ValueError("VLABenchEnv requires a task to be specified")
+ env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+ return create_vlabench_envs(
+ task=self.task,
+ n_envs=n_envs,
+ gym_kwargs=self.gym_kwargs,
+ env_cls=env_cls,
+ )
+
+
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
class IsaaclabArenaEnv(HubEnvConfig):
diff --git a/src/lerobot/envs/vlabench.py b/src/lerobot/envs/vlabench.py
new file mode 100644
index 000000000..922973a16
--- /dev/null
+++ b/src/lerobot/envs/vlabench.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VLABench environment wrapper for LeRobot.
+
+VLABench is a large-scale benchmark for language-conditioned robotic manipulation
+with long-horizon reasoning, built on MuJoCo/dm_control.
+
+- Paper: https://arxiv.org/abs/2412.18194
+- GitHub: https://github.com/OpenMOSS/VLABench
+- Website: https://vlabench.github.io
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from typing import Any
+
+import cv2
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+from scipy.spatial.transform import Rotation
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+ACTION_DIM = 7 # pos(3) + euler(3) + gripper(1)
+ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32)
+ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+
+# Default max episode steps per task type
+DEFAULT_MAX_EPISODE_STEPS = 500
+
+# VLABench task suites
+PRIMITIVE_TASKS = [
+ "select_fruit",
+ "select_toy",
+ "select_chemistry_tube",
+ "add_condiment",
+ "select_book",
+ "select_painting",
+ "select_drink",
+ "insert_flower",
+ "select_billiards",
+ "select_ingredient",
+ "select_mahjong",
+ "select_poker",
+ # Physical series
+ "density_qa",
+ "friction_qa",
+ "magnetism_qa",
+ "reflection_qa",
+ "simple_cuestick_usage",
+ "simple_seesaw_usage",
+ "sound_speed_qa",
+ "thermal_expansion_qa",
+ "weight_qa",
+]
+
+COMPOSITE_TASKS = [
+ "cluster_billiards",
+ "cluster_book",
+ "cluster_drink",
+ "cluster_toy",
+ "cook_dishes",
+ "cool_drink",
+ "find_unseen_object",
+ "get_coffee",
+ "hammer_nail",
+ "heat_food",
+ "make_juice",
+ "play_mahjong",
+ "play_math_game",
+ "play_poker",
+ "play_snooker",
+ "rearrange_book",
+ "rearrange_chemistry_tube",
+ "set_dining_table",
+ "set_study_table",
+ "store_food",
+ "take_chemistry_experiment",
+ "use_seesaw_complex",
+]
+
+SUITE_TASKS: dict[str, list[str]] = {
+ "primitive": PRIMITIVE_TASKS,
+ "composite": COMPOSITE_TASKS,
+}
+
+
+class VLABenchEnv(gym.Env):
+ """Gymnasium wrapper for VLABench environments.
+
+ Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface.
+ Supports multiple cameras (front, second, wrist) and end-effector control.
+ """
+
+ metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
+
+ def __init__(
+ self,
+ task: str = "select_fruit",
+ obs_type: str = "pixels_agent_pos",
+ render_mode: str = "rgb_array",
+ render_resolution: tuple[int, int] = (480, 480),
+ robot: str = "franka",
+ max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS,
+ action_mode: str = "eef",
+ ):
+ super().__init__()
+ self.task = task
+ self.obs_type = obs_type
+ self.render_mode = render_mode
+ self.render_resolution = render_resolution
+ self.robot = robot
+ self._max_episode_steps = max_episode_steps
+ self.action_mode = action_mode
+
+ # Deferred — created on first reset() inside worker subprocess to avoid
+ # inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers.
+ # We never cache `env.physics`: dm_control exposes it as a weakref
+ # proxy that goes stale across resets (rebuilds the sim), so we always
+ # refetch it via `self._env.physics` at the call site.
+ self._env = None
+ self.task_description = "" # populated on first reset
+ # Cached world-frame XYZ of the robot base link. The VLABench datasets
+ # log both `observation.state` positions and `actions` positions in
+ # robot-base frame (see VLABench/scripts/convert_to_lerobot.py which
+ # subtracts `robot_frame_pos` from ee_pos). The robot is attached at a
+ # fixed offset per task so this is safe to cache once per env build.
+ self._robot_base_xyz: np.ndarray | None = None
+
+ h, w = self.render_resolution
+
+ if self.obs_type == "state":
+ raise NotImplementedError(
+ "The 'state' observation type is not supported in VLABenchEnv. "
+ "Please use 'pixels' or 'pixels_agent_pos'."
+ )
+ elif self.obs_type == "pixels":
+ self.observation_space = spaces.Dict(
+ {
+ "pixels": spaces.Dict(
+ {
+ "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ }
+ ),
+ }
+ )
+ elif self.obs_type == "pixels_agent_pos":
+ self.observation_space = spaces.Dict(
+ {
+ "pixels": spaces.Dict(
+ {
+ "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+ }
+ ),
+ "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64),
+ }
+ )
+ else:
+ raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+ self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32)
+
+ # Max attempts to rebuild the underlying env when MuJoCo throws
+ # `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step
+ # reset warm-up. Some random task/layout samples land in unstable
+ # initial configurations; re-sampling the layout almost always
+ # gives a stable one. A handful of upstream tasks (notably
+ # `select_mahjong`) have layout samplers that diverge often enough
+ # to need >>5 retries, so we pick a generous ceiling.
+ _ENSURE_ENV_MAX_ATTEMPTS = 20
+
+ def _ensure_env(self) -> None:
+ """Create the underlying VLABench env on first use.
+
+ Called inside the worker subprocess after fork(), so each worker gets
+ its own clean rendering context rather than inheriting a stale one from
+ the parent process (which causes crashes with AsyncVectorEnv).
+
+ Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20
+ warm-up `step()` calls while toggling gravity/fluids to let the scene
+ settle; for some random layouts MuJoCo's integrator diverges and
+ raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields
+ a stable one, so we retry a number of times before giving up. Between
+ attempts we reseed NumPy's global RNG from OS entropy so the upstream
+ task sampler explores fresh initial states — without this, retries
+ can replay the same diverging configuration when the sampler is
+ deterministic given the current RNG state.
+ """
+ if self._env is not None:
+ return
+
+ import VLABench.robots # noqa: F401 # type: ignore[import-untyped]
+ import VLABench.tasks # noqa: F401 # type: ignore[import-untyped]
+ from dm_control.rl.control import PhysicsError # type: ignore[import-untyped]
+ from VLABench.envs import load_env # type: ignore[import-untyped]
+
+ h, w = self.render_resolution
+ last_exc: PhysicsError | None = None
+ for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
+ try:
+ env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
+ self._env = env
+ break
+ except PhysicsError as exc:
+ last_exc = exc
+ logger.warning(
+ "PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
+ attempt,
+ self._ENSURE_ENV_MAX_ATTEMPTS,
+ self.task,
+ exc,
+ )
+ np.random.seed(None)
+ if self._env is None:
+ assert last_exc is not None
+ raise RuntimeError(
+ f"VLABench task '{self.task}' failed to produce a stable "
+ f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
+ f"attempts. This task's upstream sampler diverges too "
+ f"often for the configured robot; consider removing it "
+ f"from the eval set. Last physics error: {last_exc}"
+ ) from last_exc
+
+ # Extract task description from the dm_control task
+ task_obj = self._env.task
+ if hasattr(task_obj, "task_description"):
+ self.task_description = task_obj.task_description
+ elif hasattr(task_obj, "language_instruction"):
+ self.task_description = task_obj.language_instruction
+ else:
+ self.task_description = self.task
+
+ # Cache robot base world position so `_build_ctrl_from_action` and
+ # `_get_obs` can translate between robot-frame (dataset) and
+ # world-frame (dm_control) without hitting physics every call.
+ try:
+ self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
+ 3
+ )
+ except Exception:
+ # Fallback to VLABench's default Franka base position.
+ self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)
+
+    def _get_obs(self) -> dict:
+        """Build the current observation dict from the inner dm_control env.
+
+        Returns:
+            For ``obs_type == "pixels"``: ``{"pixels": {key: (h, w, 3) uint8}}``
+            with the fixed keys ``image``, ``second_image``, ``wrist_image``.
+            For ``obs_type == "pixels_agent_pos"``: additionally ``"agent_pos"``,
+            a float64 7-vector ``[pos_robot(3), euler_xyz(3), gripper(1)]``.
+
+        Raises:
+            ValueError: for any other ``obs_type``.
+        """
+        assert self._env is not None
+
+        obs = self._env.get_observation()
+        h, w = self.render_resolution
+
+        def _to_hwc3(arr: np.ndarray) -> np.ndarray:
+            """Coerce any camera array to the declared (h, w, 3) uint8 shape."""
+            a = np.asarray(arr)
+            # Drop a leading singleton batch dim if present.
+            while a.ndim > 3 and a.shape[0] == 1:
+                a = a[0]
+            if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
+                # CHW → HWC
+                a = np.transpose(a, (1, 2, 0))
+            if a.ndim == 2:
+                # Grayscale (H, W) → replicate to 3 channels.
+                a = np.stack([a] * 3, axis=-1)
+            if a.ndim != 3:
+                # Unrecognized layout — emit a black frame rather than crash.
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            # Force 3 channels.
+            if a.shape[-1] == 1:
+                a = np.repeat(a, 3, axis=-1)
+            elif a.shape[-1] == 4:
+                # Presumably RGBA — drop the 4th channel. TODO confirm upstream order.
+                a = a[..., :3]
+            elif a.shape[-1] != 3:
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            if a.shape[:2] != (h, w):
+                # NB: cv2.resize takes (width, height), not (height, width).
+                a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
+            return a.astype(np.uint8)
+
+        # Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
+        raw_frames: list[np.ndarray] = []
+        if "rgb" in obs:
+            rgb = obs["rgb"]
+            if isinstance(rgb, np.ndarray):
+                if rgb.ndim == 4:
+                    raw_frames = [rgb[i] for i in range(rgb.shape[0])]
+                elif rgb.ndim == 3:
+                    raw_frames = [rgb]
+
+        # Map cameras positionally onto the dataset's fixed key order; any
+        # missing camera becomes a black frame so downstream shapes stay constant.
+        image_keys = ["image", "second_image", "wrist_image"]
+        images: dict[str, np.ndarray] = {}
+        for i, key in enumerate(image_keys):
+            if i < len(raw_frames):
+                images[key] = _to_hwc3(raw_frames[i])
+            else:
+                images[key] = np.zeros((h, w, 3), dtype=np.uint8)
+
+        # Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
+        # to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
+        # gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
+        # are stored in robot-base frame and orientations as scipy extrinsic
+        # 'xyz' euler angles.
+        raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
+        pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
+        quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
+        gripper = float(raw[7]) if raw.size >= 8 else 0.0
+
+        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
+        pos_robot = pos_world - base
+        # Reorder wxyz → scipy's xyzw convention before the euler conversion.
+        euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
+            "xyz", degrees=False
+        )
+
+        ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+        elif self.obs_type == "pixels_agent_pos":
+            return {
+                "pixels": images,
+                "agent_pos": ee_state.astype(np.float64),
+            }
+        else:
+            raise ValueError(f"Unknown obs_type: {self.obs_type}")
+
+ # ---- Action adaptation (EEF → joint ctrl) --------------------------------
+ #
+ # The HF vlabench datasets log 7D actions
+ # `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
+ # exactly matching VLABench's own eval pipeline (evaluator.base):
+ # pos, euler, g = policy(...)
+ # quat = euler_to_quaternion(*euler) # extrinsic xyz -> wxyz
+ # _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
+ # env.step(np.concatenate([qpos, [g, g]]))
+ #
+ # VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
+ # Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
+ # above conversion so the policy's EEF commands actually drive the robot.
+
+ _FRANKA_FINGER_OPEN = 0.04 # qpos when gripper fully open
+
+    def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
+        """Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector.
+
+        For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) +
+        2 gripper finger qposes (open/closed based on the gripper scalar).
+        If the action is already joint-space (shape matches ctrl_dim), pass
+        through.
+
+        Args:
+            action: 1-D action; either `ctrl_dim` joint values, or a 7-vector
+                `[x, y, z (robot frame), rx, ry, rz (extrinsic xyz), gripper]`.
+            ctrl_dim: size of the simulator's `data.ctrl` vector.
+
+        Returns:
+            float64 array of length `ctrl_dim`.
+        """
+        if action.shape[0] == ctrl_dim:
+            # Already joint-space — no conversion needed.
+            return action.astype(np.float64, copy=False)
+
+        if action.shape[0] != 7:
+            # Unknown layout — fall back to zero-pad so the sim doesn't crash.
+            padded = np.zeros(ctrl_dim, dtype=np.float64)
+            padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim]
+            return padded
+
+        from dm_control.utils.inverse_kinematics import qpos_from_site_pose
+
+        # Action position is in robot-base frame (see convert_to_lerobot.py);
+        # dm_control's IK expects a world-frame target.
+        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
+        pos_world = np.asarray(action[:3], dtype=np.float64) + base
+        rx, ry, rz = float(action[3]), float(action[4]), float(action[5])
+        gripper = float(np.clip(action[6], 0.0, 1.0))
+
+        # Dataset euler is scipy extrinsic 'xyz' (same as VLABench's
+        # `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK
+        # and MuJoCo use `[w, x, y, z]`, so reorder.
+        qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat()
+        quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64)
+
+        assert self._env is not None
+        robot = self._env.task.robot
+        site_name = robot.end_effector_site.full_identifier
+
+        # inplace=False so IK doesn't mutate physics state mid-step — we only
+        # want the solved qpos. Fetch a fresh physics handle — caching it can
+        # yield a stale weakref after a reset.
+        ik_result = qpos_from_site_pose(
+            self._env.physics,
+            site_name=site_name,
+            target_pos=pos_world,
+            target_quat=quat,
+            inplace=False,
+            max_steps=100,
+        )
+        n_dof = robot.n_dof  # 7 for Franka
+        arm_qpos = ik_result.qpos[:n_dof]
+
+        # Dataset gripper convention: 1 = open (finger qpos = 0.04),
+        # 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
+        # where `trajectory[i][-1] > 0.03` is encoded as `1`.
+        finger_qpos = gripper * self._FRANKA_FINGER_OPEN
+
+        ctrl = np.zeros(ctrl_dim, dtype=np.float64)
+        ctrl[:n_dof] = arm_qpos
+        # Remaining entries are gripper fingers (usually 2 for Franka).
+        ctrl[n_dof:] = finger_qpos
+        return ctrl
+
+ def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
+ self._ensure_env()
+ assert self._env is not None
+ super().reset(seed=seed)
+
+ if seed is not None:
+ self._seed_inner_env(int(self.np_random.integers(0, 2**31 - 1)))
+
+ self._env.reset()
+
+ observation = self._get_obs()
+ info = {"is_success": False}
+ return observation, info
+
+ def _seed_inner_env(self, seed: int) -> None:
+ """Propagate `seed` to the inner dm_control env. `Environment.reset()`
+ doesn't accept a seed, so we re-seed the task and environment
+ `RandomState`s directly. Best-effort: silently skipped when the
+ expected attributes are absent on a given VLABench version.
+ """
+ for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
+ owner = getattr(self._env, owner_attr) if owner_attr else self._env
+ rng = getattr(owner, rng_attr, None)
+ rng_seed = getattr(rng, "seed", None)
+ if callable(rng_seed):
+ rng_seed(seed)
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Advance the inner dm_control env one control step.
+
+        Args:
+            action: 1-D action — either an EEF 7-vector or a joint-space
+                vector (see `_build_ctrl_from_action`).
+
+        Returns:
+            `(observation, reward, terminated, truncated, info)`. `truncated`
+            is always False here; time limits, if any, are applied elsewhere
+            — TODO confirm against the eval driver.
+
+        Raises:
+            ValueError: if `action` is not 1-D, or `action_mode` is unknown.
+        """
+        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
+
+        self._ensure_env()
+        assert self._env is not None
+
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        if self.action_mode not in ("eef", "joint", "delta_eef"):
+            raise ValueError(f"Unknown action_mode: {self.action_mode}")
+
+        # Always refetch physics — dm_control returns a weakref proxy that can
+        # go stale across resets.
+        physics = self._env.physics
+        ctrl_dim = int(physics.data.ctrl.shape[0])
+        ctrl = self._build_ctrl_from_action(action, ctrl_dim)
+        try:
+            timestep = self._env.step(ctrl)
+        except PhysicsError as exc:
+            # Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
+            # a graceful failed termination rather than a hard crash — the
+            # rest of the multi-task eval should still run.
+            logger.warning(
+                "PhysicsError during step on task '%s': %s. Terminating episode.",
+                self.task,
+                exc,
+            )
+            observation = self._get_obs()
+            info = {"task": self.task, "is_success": False, "physics_error": True}
+            # Drop the stale env so the next reset() rebuilds it cleanly.
+            with contextlib.suppress(Exception):
+                self._env.close()
+            self._env = None
+            return observation, 0.0, True, False, info
+
+        # Extract reward from dm_control timestep
+        reward = float(timestep.reward) if timestep.reward is not None else 0.0
+
+        # Check success via the task's termination condition
+        is_success = False
+        if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
+            is_success = bool(self._env.task.should_terminate_episode(self._env.physics))
+
+        terminated = is_success
+        truncated = False
+        info = {
+            "task": self.task,
+            "is_success": is_success,
+        }
+
+        # Capture the observation *before* any auto-reset below, so callers
+        # receive the final frame of the finished episode.
+        observation = self._get_obs()
+
+        if terminated:
+            # NOTE(review): auto-reset inside step() — confirm the outer
+            # vector-env wrapper does not auto-reset again (double reset).
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+ def render(self) -> np.ndarray:
+ self._ensure_env()
+ obs = self._get_obs()
+ return obs["pixels"]["image"]
+
+ def close(self):
+ if self._env is not None:
+ self._env.close()
+ self._env = None
+
+
+# ---- Main API ----------------------------------------------------------------
+
+
+def create_vlabench_envs(
+ task: str,
+ n_envs: int,
+ gym_kwargs: dict[str, Any] | None = None,
+ env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+) -> dict[str, dict[int, Any]]:
+ """
+ Create vectorized VLABench environments with a consistent return shape.
+
+ Returns:
+ dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
+
+ Notes:
+ - n_envs is the number of rollouts *per task*.
+ - `task` can be a suite name ("primitive", "composite"), a comma-separated list of
+ suite names, or individual task names (e.g. "select_fruit,heat_food").
+ """
+ if env_cls is None or not callable(env_cls):
+ raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+ if not isinstance(n_envs, int) or n_envs <= 0:
+ raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+ gym_kwargs = dict(gym_kwargs or {})
+ task_groups = [t.strip() for t in task.split(",") if t.strip()]
+ if not task_groups:
+ raise ValueError("`task` must contain at least one VLABench task or suite name.")
+
+ logger.info(
+ "Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
+ task_groups,
+ n_envs,
+ )
+
+ is_async = env_cls is gym.vector.AsyncVectorEnv
+ cached_obs_space = None
+ cached_act_space = None
+ cached_metadata = None
+ out: dict[str, dict[int, Any]] = defaultdict(dict)
+
+ for group in task_groups:
+ # Check if it's a suite name, otherwise treat as individual task
+ tasks = SUITE_TASKS.get(group, [group])
+
+ for tid, task_name in enumerate(tasks):
+ logger.info(
+ "Building vec env | group=%s | task_id=%d | task=%s",
+ group,
+ tid,
+ task_name,
+ )
+
+ fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
+
+ if is_async:
+ lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+ if cached_obs_space is None:
+ cached_obs_space = lazy.observation_space
+ cached_act_space = lazy.action_space
+ cached_metadata = lazy.metadata
+ out[group][tid] = lazy
+ else:
+ out[group][tid] = env_cls(fns)
+
+ return {group: dict(task_map) for group, task_map in out.items()}