diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 3a5d92c44..b07c8f8da 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -843,3 +843,103 @@ jobs: name: libero-plus-metrics path: /tmp/libero-plus-artifacts/metrics.json if-no-files-found: warn + + # ── VLABENCH ───────────────────────────────────────────────────────────── + # Isolated image: lerobot[vlabench] only (VLABench, mujoco==3.2.2, dm-control chain) + vlabench-integration-test: + name: VLABench — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build VLABench benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.vlabench + push: false + load: true + tags: lerobot-benchmark-vlabench:ci + build-args: | + VLABENCH_ASSETS_REPO=lerobot/vlabench-assets + + - name: Run VLABench smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name vlabench-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e MUJOCO_GL=egl \ + lerobot-benchmark-vlabench:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env vlabench \ + --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy VLABench artifacts from container + if: always() + run: | + mkdir -p /tmp/vlabench-artifacts + docker cp vlabench-eval:/tmp/eval-artifacts/. 
/tmp/vlabench-artifacts/ 2>/dev/null || true + docker rm -f vlabench-eval || true + + - name: Parse VLABench eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/vlabench-artifacts \ + --env vlabench \ + --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --policy lerobot/smolvla_vlabench + + - name: Upload VLABench rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: vlabench-rollout-video + path: /tmp/vlabench-artifacts/videos/ + if-no-files-found: warn + + - name: Upload VLABench eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: vlabench-metrics + path: /tmp/vlabench-artifacts/metrics.json + if-no-files-found: warn diff --git a/docker/Dockerfile.benchmark.vlabench b/docker/Dockerfile.benchmark.vlabench new file mode 100644 index 000000000..13502a3e3 --- /dev/null +++ b/docker/Dockerfile.benchmark.vlabench @@ -0,0 +1,99 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for VLABench integration tests. +# Extends the nightly GPU image with the PR's source code and VLABench setup. +# +# Build: docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench . 
+# Run: docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control. +# Shallow-clone without submodule recursion (nested SSH-only submodules fail in CI). +# Editable install (-e) because VLABench/utils/ has no __init__.py, so +# find_packages() omits it from wheels; editable mode uses the source tree directly. +# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py). +# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import +# time. Guard the call so missing dirs return [] instead of crashing (in case +# the asset download is partial). +# +# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need +# an upstream fix; don't rely on `main`/`develop` drift. +ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017 +ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85 +RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \ + git -C ~/VLABench checkout ${VLABENCH_SHA} && \ + git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \ + git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \ + python3 -c "\ +import pathlib; \ +p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \ +t = p.read_text(); \ +p.write_text(t.replace( \ + 'subdirs = os.listdir(xml_dir)', \ + 'if not os.path.isdir(xml_dir): return []\n subdirs = os.listdir(xml_dir)'))" && \ + uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \ + mujoco==3.2.2 dm-control==1.0.22 \ + open3d colorlog scikit-learn openai gdown + +# Download VLABench mesh assets. Task configs reference object meshes +# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without +# them the task builder picks from an empty mesh list and crashes with +# IndexError at task-build time (random.choice([]) in config_manager.py). 
+# +# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time +# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll +# snapshot_download the repo into VLABench's assets dir. This is the reliable +# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have +# viewed or downloaded this file recently") on shared academic files. +# +# After download we *validate* that at least one XML exists under each +# task-critical subtree and fail the build loudly if not. Silent-empty asset +# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them +# here rather than after a 10-minute eval build. +# +# Fallback: VLABench's own gdown-based script. Best-effort only. +ARG VLABENCH_ASSETS_REPO="" +RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \ + if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \ + echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \ + uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \ + python -c "from huggingface_hub import snapshot_download; \ +p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \ + local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \ +print('snapshot_download returned:', p)"; \ + else \ + echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \ + python ~/VLABench/scripts/download_assets.py --choice all; \ + fi && \ + python -c "\ +from pathlib import Path; \ +import sys; \ +root = Path('${ASSETS_DIR}'); \ +checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \ +failed = []; \ +print(f'Validating VLABench assets under {root}'); \ +[print(f' {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \ +[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \ +sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')" + +# Overlay the PR's source 
code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +# Re-install lerobot editably so the new source (with VLABenchEnv registration +# and updated obs handling) replaces the stale package baked into the nightly image. +RUN uv pip install --no-cache --no-deps -e . + +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index d29f4c545..f5e1129f3 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -91,6 +91,8 @@ title: RoboMME - local: envhub_isaaclab_arena title: NVIDIA IsaacLab Arena Environments + - local: vlabench + title: VLABench title: "Benchmarks" - sections: - local: introduction_processors diff --git a/docs/source/vlabench.mdx b/docs/source/vlabench.mdx new file mode 100644 index 000000000..da579d674 --- /dev/null +++ b/docs/source/vlabench.mdx @@ -0,0 +1,176 @@ +# VLABench + +[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below). 
+ +- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194) +- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) +- Project website: [vlabench.github.io](https://vlabench.github.io) +- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) + +VLABench benchmark overview + +## Available tasks + +VLABench ships two task suites covering **43 task categories** in LeRobot's `--env.task` surface: + +| Suite | CLI name | Tasks | Description | +| --------- | ----------- | ----- | ---------------------------------------------------------------- | +| Primitive | `primitive` | 21 | Single / few-skill combinations (select, insert, physics QA) | +| Composite | `composite` | 22 | Multi-step reasoning and long-horizon planning (cook, rearrange) | + +**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`). + +**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`. 
+ +`--env.task` accepts three forms: + +- a single task name (`select_fruit`) +- a comma-separated list (`select_fruit,heat_food`) +- a suite shortcut (`primitive`, `composite`, or `primitive,composite`) + +## Installation + +VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets: + +```bash +# After following the standard LeRobot installation instructions. + +git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench +git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms +pip install -e ~/VLABench -e ~/rrt-algorithms +pip install "mujoco==3.2.2" "dm-control==1.0.22" \ + open3d colorlog scikit-learn openai gdown + +python ~/VLABench/scripts/download_assets.py +``` + + +VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +## Evaluation + +All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on. 
+ +### Single-task evaluation (recommended for quick iteration) + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Multi-task evaluation + +Pass a comma-separated list of tasks: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit,select_toy,add_condiment,heat_food \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Suite-wide evaluation + +Run an entire suite (all 21 primitives or all 22 composites): + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=primitive \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + --env.max_parallel_tasks=1 \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +Or both suites: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=primitive,composite \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + --env.max_parallel_tasks=1 \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": 
"observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Recommended evaluation episodes + +**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper. + +## Policy inputs and outputs + +**Observations:** + +- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper) +- `observation.images.image` — front camera, 480×480 HWC uint8 +- `observation.images.second_image` — second camera, 480×480 HWC uint8 +- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper. + +## Training + +### Datasets + +Pre-collected VLABench datasets in LeRobot format on the Hub: + +- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images. +- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images. 
+ +### Example training command + +Fine-tune a SmolVLA base on the primitive suite: + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \ + --policy.load_vlm_weights=true \ + --policy.push_to_hub=true \ + --dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \ + --env.type=vlabench \ + --env.task=select_fruit \ + --output_dir=./outputs/smolvla_vlabench_primitive \ + --steps=100000 \ + --batch_size=4 \ + --eval_freq=5000 \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --save_freq=10000 +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark. diff --git a/pyproject.toml b/pyproject.toml index dbc866a49..790c7f2d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"] pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"] metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"] +# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution +# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI +# release), so any `vlabench>=X` pip spec is unresolvable. Install it +# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx +# for the recipe. 
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2 # which conflicts with lerobot's numpy>=2 base pin, so the two trees can't # resolve into a single env. Install it only in the RoboMME Docker image diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 3bdc9035f..0d27885cf 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -142,6 +142,21 @@ def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> return out +def _vlabench_descriptions(task_spec: str) -> dict[str, str]: + """For each task in the comma-separated list, emit a cleaned-name label. + + VLABench tasks carry language instructions on their dm_control task + object, but pulling them requires loading the full env per task + (~seconds each). The CI smoke-eval already captures the instruction + inside its episode info; this mapping is just enough to key + `metrics.json` by `<task>_0`. + """ + out: dict[str, str] = {} + for task in (t.strip() for t in task_spec.split(",") if t.strip()): + out[f"{task}_0"] = task.replace("_", " ").strip() + return out + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)") @@ -171,6 +186,8 @@ def main() -> int: descriptions = _robocasa_descriptions(args.task) elif args.env == "robomme": descriptions = _robomme_descriptions(args.task, task_ids=task_ids) + elif args.env == "vlabench": + descriptions = _vlabench_descriptions(args.task) else: print( f"[extract_task_descriptions] No description extractor for env '{args.env}'.", ) diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 5ce06b475..84c40472f 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -573,6 +573,71 @@ class RoboCasaEnv(EnvConfig): ) +@EnvConfig.register_subclass("vlabench") +@dataclass +class VLABenchEnv(EnvConfig): + 
task: str = "select_fruit" + fps: int = 10 + episode_length: int = 500 + obs_type: str = "pixels_agent_pos" + render_mode: str = "rgb_array" + render_resolution: tuple[int, int] = (480, 480) + robot: str = "franka" + action_mode: str = "eef" + features: dict[str, PolicyFeature] = field( + default_factory=lambda: { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)), + } + ) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "agent_pos": OBS_STATE, + "pixels/image": f"{OBS_IMAGES}.image", + "pixels/second_image": f"{OBS_IMAGES}.second_image", + "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image", + } + ) + + def __post_init__(self): + h, w = self.render_resolution + if self.obs_type == "pixels": + self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + elif self.obs_type == "pixels_agent_pos": + self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,)) + else: + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + @property + def gym_kwargs(self) -> dict: + return { + "obs_type": self.obs_type, + "render_mode": self.render_mode, + "render_resolution": self.render_resolution, + "robot": self.robot, + "max_episode_steps": self.episode_length, + "action_mode": self.action_mode, + } + + def create_envs(self, n_envs: int, use_async_envs: bool = False): + from .vlabench import create_vlabench_envs + + if self.task is None: + raise ValueError("VLABenchEnv requires a task to be specified") + env_cls 
= _make_vec_env_cls(use_async_envs, n_envs) + return create_vlabench_envs( + task=self.task, + n_envs=n_envs, + gym_kwargs=self.gym_kwargs, + env_cls=env_cls, + ) + + @EnvConfig.register_subclass("isaaclab_arena") @dataclass class IsaaclabArenaEnv(HubEnvConfig): diff --git a/src/lerobot/envs/vlabench.py b/src/lerobot/envs/vlabench.py new file mode 100644 index 000000000..922973a16 --- /dev/null +++ b/src/lerobot/envs/vlabench.py @@ -0,0 +1,589 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VLABench environment wrapper for LeRobot. + +VLABench is a large-scale benchmark for language-conditioned robotic manipulation +with long-horizon reasoning, built on MuJoCo/dm_control. 
+ +- Paper: https://arxiv.org/abs/2412.18194 +- GitHub: https://github.com/OpenMOSS/VLABench +- Website: https://vlabench.github.io +""" + +from __future__ import annotations + +import contextlib +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from typing import Any + +import cv2 +import gymnasium as gym +import numpy as np +from gymnasium import spaces +from scipy.spatial.transform import Rotation + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv + +logger = logging.getLogger(__name__) + +ACTION_DIM = 7 # pos(3) + euler(3) + gripper(1) +ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32) +ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32) + +# Default max episode steps per task type +DEFAULT_MAX_EPISODE_STEPS = 500 + +# VLABench task suites +PRIMITIVE_TASKS = [ + "select_fruit", + "select_toy", + "select_chemistry_tube", + "add_condiment", + "select_book", + "select_painting", + "select_drink", + "insert_flower", + "select_billiards", + "select_ingredient", + "select_mahjong", + "select_poker", + # Physical series + "density_qa", + "friction_qa", + "magnetism_qa", + "reflection_qa", + "simple_cuestick_usage", + "simple_seesaw_usage", + "sound_speed_qa", + "thermal_expansion_qa", + "weight_qa", +] + +COMPOSITE_TASKS = [ + "cluster_billiards", + "cluster_book", + "cluster_drink", + "cluster_toy", + "cook_dishes", + "cool_drink", + "find_unseen_object", + "get_coffee", + "hammer_nail", + "heat_food", + "make_juice", + "play_mahjong", + "play_math_game", + "play_poker", + "play_snooker", + "rearrange_book", + "rearrange_chemistry_tube", + "set_dining_table", + "set_study_table", + "store_food", + "take_chemistry_experiment", + "use_seesaw_complex", +] + +SUITE_TASKS: dict[str, list[str]] = { + "primitive": PRIMITIVE_TASKS, + "composite": COMPOSITE_TASKS, +} + + +class VLABenchEnv(gym.Env): + """Gymnasium wrapper for 
VLABench environments. + + Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface. + Supports multiple cameras (front, second, wrist) and end-effector control. + """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 10} + + def __init__( + self, + task: str = "select_fruit", + obs_type: str = "pixels_agent_pos", + render_mode: str = "rgb_array", + render_resolution: tuple[int, int] = (480, 480), + robot: str = "franka", + max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS, + action_mode: str = "eef", + ): + super().__init__() + self.task = task + self.obs_type = obs_type + self.render_mode = render_mode + self.render_resolution = render_resolution + self.robot = robot + self._max_episode_steps = max_episode_steps + self.action_mode = action_mode + + # Deferred — created on first reset() inside worker subprocess to avoid + # inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers. + # We never cache `env.physics`: dm_control exposes it as a weakref + # proxy that goes stale across resets (rebuilds the sim), so we always + # refetch it via `self._env.physics` at the call site. + self._env = None + self.task_description = "" # populated on first reset + # Cached world-frame XYZ of the robot base link. The VLABench datasets + # log both `observation.state` positions and `actions` positions in + # robot-base frame (see VLABench/scripts/convert_to_lerobot.py which + # subtracts `robot_frame_pos` from ee_pos). The robot is attached at a + # fixed offset per task so this is safe to cache once per env build. + self._robot_base_xyz: np.ndarray | None = None + + h, w = self.render_resolution + + if self.obs_type == "state": + raise NotImplementedError( + "The 'state' observation type is not supported in VLABenchEnv. " + "Please use 'pixels' or 'pixels_agent_pos'." 
+ ) + elif self.obs_type == "pixels": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + } + ), + } + ) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + } + ), + "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64), + } + ) + else: + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32) + + # Max attempts to rebuild the underlying env when MuJoCo throws + # `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step + # reset warm-up. Some random task/layout samples land in unstable + # initial configurations; re-sampling the layout almost always + # gives a stable one. A handful of upstream tasks (notably + # `select_mahjong`) have layout samplers that diverge often enough + # to need >>5 retries, so we pick a generous ceiling. + _ENSURE_ENV_MAX_ATTEMPTS = 20 + + def _ensure_env(self) -> None: + """Create the underlying VLABench env on first use. + + Called inside the worker subprocess after fork(), so each worker gets + its own clean rendering context rather than inheriting a stale one from + the parent process (which causes crashes with AsyncVectorEnv). + + Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20 + warm-up `step()` calls while toggling gravity/fluids to let the scene + settle; for some random layouts MuJoCo's integrator diverges and + raises `mjWARN_BADQACC`. 
        Re-sampling the layout almost always yields
        a stable one, so we retry a number of times before giving up. Between
        attempts we reseed NumPy's global RNG from OS entropy so the upstream
        task sampler explores fresh initial states — without this, retries
        can replay the same diverging configuration when the sampler is
        deterministic given the current RNG state.
        """
        if self._env is not None:
            # Already built (e.g. a later reset()) — nothing to do.
            return

        # Lazy imports: constructing the wrapper must not require the
        # VLABench / dm_control stack; the robot/task imports register
        # VLABench's entities as a side effect (hence noqa: F401).
        import VLABench.robots  # noqa: F401 # type: ignore[import-untyped]
        import VLABench.tasks  # noqa: F401 # type: ignore[import-untyped]
        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
        from VLABench.envs import load_env  # type: ignore[import-untyped]

        h, w = self.render_resolution
        last_exc: PhysicsError | None = None
        for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
            try:
                env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
                self._env = env
                break
            except PhysicsError as exc:
                last_exc = exc
                logger.warning(
                    "PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
                    attempt,
                    self._ENSURE_ENV_MAX_ATTEMPTS,
                    self.task,
                    exc,
                )
                # Reseed NumPy's global RNG from OS entropy so the next
                # load_env() samples a fresh layout (see docstring above).
                np.random.seed(None)
        if self._env is None:
            assert last_exc is not None
            raise RuntimeError(
                f"VLABench task '{self.task}' failed to produce a stable "
                f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
                f"attempts. This task's upstream sampler diverges too "
                f"often for the configured robot; consider removing it "
                f"from the eval set. Last physics error: {last_exc}"
            ) from last_exc

        # Extract task description from the dm_control task. Attribute names
        # are probed because they vary across VLABench versions; the task id
        # itself is the last-resort fallback.
        task_obj = self._env.task
        if hasattr(task_obj, "task_description"):
            self.task_description = task_obj.task_description
        elif hasattr(task_obj, "language_instruction"):
            self.task_description = task_obj.language_instruction
        else:
            self.task_description = self.task

        # Cache robot base world position so `_build_ctrl_from_action` and
        # `_get_obs` can translate between robot-frame (dataset) and
        # world-frame (dm_control) without hitting physics every call.
        try:
            self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
                3
            )
        except Exception:
            # Fallback to VLABench's default Franka base position.
            self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)

    def _get_obs(self) -> dict:
        """Get current observation from the environment.

        Returns a dict shaped by ``self.obs_type``:
          - ``"pixels"``: ``{"pixels": {image, second_image, wrist_image}}``
            where each camera frame is (h, w, 3) uint8;
          - ``"pixels_agent_pos"``: additionally ``"agent_pos"``, a 7-vector
            ``[pos_robot(3), euler_xyz(3), gripper(1)]`` in float64.

        Raises:
            ValueError: for any other ``obs_type``.
        """
        assert self._env is not None

        obs = self._env.get_observation()
        h, w = self.render_resolution

        def _to_hwc3(arr: np.ndarray) -> np.ndarray:
            """Coerce any camera array to the declared (h, w, 3) uint8 shape.

            Unrecognizable layouts degrade to an all-black frame rather than
            raising, so a single odd camera can't kill the rollout.
            """
            a = np.asarray(arr)
            # Drop a leading singleton batch dim if present.
            while a.ndim > 3 and a.shape[0] == 1:
                a = a[0]
            if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
                # CHW → HWC (heuristic: channel-like first axis, non-channel last)
                a = np.transpose(a, (1, 2, 0))
            if a.ndim == 2:
                # Grayscale → replicate to 3 channels.
                a = np.stack([a] * 3, axis=-1)
            if a.ndim != 3:
                return np.zeros((h, w, 3), dtype=np.uint8)
            # Force 3 channels.
            if a.shape[-1] == 1:
                a = np.repeat(a, 3, axis=-1)
            elif a.shape[-1] == 4:
                # Drop alpha.
                a = a[..., :3]
            elif a.shape[-1] != 3:
                return np.zeros((h, w, 3), dtype=np.uint8)
            if a.shape[:2] != (h, w):
                # cv2.resize takes (width, height), i.e. reversed order.
                a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
            return a.astype(np.uint8)

        # Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
        raw_frames: list[np.ndarray] = []
        if "rgb" in obs:
            rgb = obs["rgb"]
            if isinstance(rgb, np.ndarray):
                if rgb.ndim == 4:
                    raw_frames = [rgb[i] for i in range(rgb.shape[0])]
                elif rgb.ndim == 3:
                    raw_frames = [rgb]

        # Map cameras positionally onto the dataset's image keys; missing
        # cameras become black frames so the observation shape is stable.
        image_keys = ["image", "second_image", "wrist_image"]
        images: dict[str, np.ndarray] = {}
        for i, key in enumerate(image_keys):
            if i < len(raw_frames):
                images[key] = _to_hwc3(raw_frames[i])
            else:
                images[key] = np.zeros((h, w, 3), dtype=np.uint8)

        # Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
        # to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
        # gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
        # are stored in robot-base frame and orientations as scipy extrinsic
        # 'xyz' euler angles.
        raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
        pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
        quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
        gripper = float(raw[7]) if raw.size >= 8 else 0.0

        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
        pos_robot = pos_world - base
        # scipy expects scalar-last [x, y, z, w]; VLABench stores wxyz — reorder.
        euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
            "xyz", degrees=False
        )

        ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)

        if self.obs_type == "pixels":
            return {"pixels": images}
        elif self.obs_type == "pixels_agent_pos":
            return {
                "pixels": images,
                "agent_pos": ee_state.astype(np.float64),
            }
        else:
            raise ValueError(f"Unknown obs_type: {self.obs_type}")

    # ---- Action adaptation (EEF → joint ctrl) --------------------------------
    #
    # The HF vlabench datasets log 7D actions
    # `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
    # exactly matching VLABench's own eval pipeline (evaluator.base):
    #     pos, euler, g = policy(...)
    #     quat = euler_to_quaternion(*euler)   # extrinsic xyz -> wxyz
    #     _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
    #     env.step(np.concatenate([qpos, [g, g]]))
    #
    # VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
    # Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
    # above conversion so the policy's EEF commands actually drive the robot.

    # Finger qpos when the Franka gripper is fully open.
    _FRANKA_FINGER_OPEN = 0.04  # qpos when gripper fully open

    def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
        """Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector.

        For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) +
        2 gripper finger qposes (open/closed based on the gripper scalar).
        If the action is already joint-space (shape matches ctrl_dim), pass
        through.
        """
        if action.shape[0] == ctrl_dim:
            # Already joint-space; avoid a copy when the dtype matches.
            return action.astype(np.float64, copy=False)

        if action.shape[0] != 7:
            # Unknown layout — fall back to zero-pad so the sim doesn't crash.
            padded = np.zeros(ctrl_dim, dtype=np.float64)
            padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim]
            return padded

        from dm_control.utils.inverse_kinematics import qpos_from_site_pose

        # Action position is in robot-base frame (see convert_to_lerobot.py);
        # dm_control's IK expects a world-frame target.
        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
        pos_world = np.asarray(action[:3], dtype=np.float64) + base
        rx, ry, rz = float(action[3]), float(action[4]), float(action[5])
        gripper = float(np.clip(action[6], 0.0, 1.0))

        # Dataset euler is scipy extrinsic 'xyz' (same as VLABench's
        # `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK
        # and MuJoCo use `[w, x, y, z]`, so reorder.
        qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat()
        quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64)

        assert self._env is not None
        robot = self._env.task.robot
        site_name = robot.end_effector_site.full_identifier

        # inplace=False so IK doesn't mutate physics state mid-step — we only
        # want the solved qpos. Fetch a fresh physics handle — caching it can
        # yield a stale weakref after a reset.
        ik_result = qpos_from_site_pose(
            self._env.physics,
            site_name=site_name,
            target_pos=pos_world,
            target_quat=quat,
            inplace=False,
            max_steps=100,
        )
        n_dof = robot.n_dof  # 7 for Franka
        arm_qpos = ik_result.qpos[:n_dof]

        # Dataset gripper convention: 1 = open (finger qpos = 0.04),
        # 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
        # where `trajectory[i][-1] > 0.03` is encoded as `1`.
        finger_qpos = gripper * self._FRANKA_FINGER_OPEN

        ctrl = np.zeros(ctrl_dim, dtype=np.float64)
        ctrl[:n_dof] = arm_qpos
        # Remaining entries are gripper fingers (usually 2 for Franka).
        ctrl[n_dof:] = finger_qpos
        return ctrl

    def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
        """Build the env on first use, optionally seed it, and reset.

        Returns the first observation and an info dict with ``is_success``
        initialized to ``False``.
        """
        self._ensure_env()
        assert self._env is not None
        super().reset(seed=seed)

        if seed is not None:
            # Derive the inner-env seed from gymnasium's np_random so the
            # whole episode is reproducible from the single `seed` argument.
            self._seed_inner_env(int(self.np_random.integers(0, 2**31 - 1)))

        self._env.reset()

        observation = self._get_obs()
        info = {"is_success": False}
        return observation, info

    def _seed_inner_env(self, seed: int) -> None:
        """Propagate `seed` to the inner dm_control env. `Environment.reset()`
        doesn't accept a seed, so we re-seed the task and environment
        `RandomState`s directly. Best-effort: silently skipped when the
        expected attributes are absent on a given VLABench version.
        """
        # (None, "_random_state") means "look on the env object itself".
        for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
            owner = getattr(self._env, owner_attr) if owner_attr else self._env
            rng = getattr(owner, rng_attr, None)
            rng_seed = getattr(rng, "seed", None)
            if callable(rng_seed):
                rng_seed(seed)

    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
        """Advance the simulation one step with a 1-D `action`.

        The action is converted to a ctrl vector via
        `_build_ctrl_from_action`. A `PhysicsError` during stepping is turned
        into a graceful terminated-without-success transition (and the env is
        dropped so the next `reset()` rebuilds it) instead of crashing the
        whole multi-task eval.

        Returns:
            (observation, reward, terminated, truncated, info) in the
            gymnasium 5-tuple convention.

        Raises:
            ValueError: if `action` is not 1-D or `action_mode` is unknown.
        """
        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]

        self._ensure_env()
        assert self._env is not None

        if action.ndim != 1:
            raise ValueError(
                f"Expected action to be 1-D (shape (action_dim,)), "
                f"but got shape {action.shape} with ndim={action.ndim}"
            )

        if self.action_mode not in ("eef", "joint", "delta_eef"):
            raise ValueError(f"Unknown action_mode: {self.action_mode}")

        # Always refetch physics — dm_control returns a weakref proxy that can
        # go stale across resets.
        physics = self._env.physics
        ctrl_dim = int(physics.data.ctrl.shape[0])
        ctrl = self._build_ctrl_from_action(action, ctrl_dim)
        try:
            timestep = self._env.step(ctrl)
        except PhysicsError as exc:
            # Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
            # a graceful failed termination rather than a hard crash — the
            # rest of the multi-task eval should still run.
            logger.warning(
                "PhysicsError during step on task '%s': %s. Terminating episode.",
                self.task,
                exc,
            )
            observation = self._get_obs()
            info = {"task": self.task, "is_success": False, "physics_error": True}
            # Drop the stale env so the next reset() rebuilds it cleanly.
            with contextlib.suppress(Exception):
                self._env.close()
            self._env = None
            return observation, 0.0, True, False, info

        # Extract reward from dm_control timestep (may be None on first step).
        reward = float(timestep.reward) if timestep.reward is not None else 0.0

        # Check success via the task's termination condition.
        # NOTE(review): `should_terminate_episode` is used here as the success
        # signal — confirm upstream it never fires for failure terminations.
        is_success = False
        if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
            is_success = bool(self._env.task.should_terminate_episode(self._env.physics))

        terminated = is_success
        truncated = False
        info = {
            "task": self.task,
            "is_success": is_success,
        }

        observation = self._get_obs()

        if terminated:
            # Eager reset keeps the env valid for vec-env wrappers that step
            # past episode end; the pre-reset observation is still returned.
            self.reset()

        return observation, reward, terminated, truncated, info

    def render(self) -> np.ndarray:
        """Return the primary camera frame, (h, w, 3) uint8."""
        self._ensure_env()
        obs = self._get_obs()
        return obs["pixels"]["image"]

    def close(self):
        """Close and drop the inner dm_control env (idempotent)."""
        if self._env is not None:
            self._env.close()
            self._env = None


# ---- Main API ----------------------------------------------------------------


def create_vlabench_envs(
    task: str,
    n_envs: int,
    gym_kwargs: dict[str, Any] | None = None,
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
) -> dict[str, dict[int, Any]]:
    """
    Create vectorized VLABench environments with a consistent return shape.

    Returns:
        dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)

    Notes:
        - n_envs is the number of rollouts *per task*.
        - `task` can be a suite name ("primitive", "composite"), a comma-separated list of
          suite names, or individual task names (e.g. "select_fruit,heat_food").

    Raises:
        ValueError: if `env_cls` is not callable, `n_envs` is not a positive
            int, or `task` contains no task/suite names.
    """
    if env_cls is None or not callable(env_cls):
        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
    if not isinstance(n_envs, int) or n_envs <= 0:
        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")

    gym_kwargs = dict(gym_kwargs or {})
    task_groups = [t.strip() for t in task.split(",") if t.strip()]
    if not task_groups:
        raise ValueError("`task` must contain at least one VLABench task or suite name.")

    logger.info(
        "Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
        task_groups,
        n_envs,
    )

    # Async vec envs get the lazy wrapper below so spaces/metadata are probed
    # once and reused across tasks instead of spawning workers per task.
    is_async = env_cls is gym.vector.AsyncVectorEnv
    cached_obs_space = None
    cached_act_space = None
    cached_metadata = None
    out: dict[str, dict[int, Any]] = defaultdict(dict)

    for group in task_groups:
        # Check if it's a suite name, otherwise treat as individual task
        tasks = SUITE_TASKS.get(group, [group])

        for tid, task_name in enumerate(tasks):
            logger.info(
                "Building vec env | group=%s | task_id=%d | task=%s",
                group,
                tid,
                task_name,
            )

            # `tn=task_name` binds the loop variable at definition time
            # (avoids the late-binding-closure pitfall).
            fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]

            if is_async:
                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    # First lazy env resolves the spaces; later ones reuse them.
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
                    cached_metadata = lazy.metadata
                out[group][tid] = lazy
            else:
                out[group][tid] = env_cls(fns)

    return {group: dict(task_map) for group, task_map in out.items()}