mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-29 22:27:14 +00:00
Merge branch 'main' into codex/model-profiling
This commit is contained in:
@@ -843,3 +843,103 @@ jobs:
|
||||
name: libero-plus-metrics
|
||||
path: /tmp/libero-plus-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── VLABENCH ─────────────────────────────────────────────────────────────
|
||||
# Isolated image: VLABench installed from source (there is no lerobot[vlabench] extra), with mujoco==3.2.2 and the dm-control chain
|
||||
vlabench-integration-test:
  name: VLABench — build image + 1-episode eval
  runs-on:
    group: aws-g6-4xlarge-plus
  env:
    HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
    # Single source of truth for the smoke-eval policy and task list. The eval,
    # the task-description extraction and the metrics parser must all see the
    # same values, so they are defined once here instead of being repeated.
    VLABENCH_POLICY: lerobot/smolvla_vlabench
    VLABENCH_TASKS: select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower

  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        persist-credentials: false
        lfs: true

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3  # zizmor: ignore[unpinned-uses]
      with:
        cache-binary: false

    # Secrets cannot be read directly in `if:`, so the username is mirrored
    # into a step-level env var and tested there.
    - name: Login to Docker Hub
      if: ${{ env.DOCKERHUB_USERNAME != '' }}
      uses: docker/login-action@v3  # zizmor: ignore[unpinned-uses]
      with:
        username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
        password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
      env:
        DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}

    # Build locally only (push: false, load: true) so the image is available
    # to the `docker run` step below under the `:ci` tag.
    - name: Build VLABench benchmark image
      uses: docker/build-push-action@v6  # zizmor: ignore[unpinned-uses]
      with:
        context: .
        file: docker/Dockerfile.benchmark.vlabench
        push: false
        load: true
        tags: lerobot-benchmark-vlabench:ci
        build-args: |
          VLABENCH_ASSETS_REPO=lerobot/vlabench-assets

    # The container is intentionally NOT started with --rm: the next step
    # copies the artifacts out of it before removing it.
    #
    # `set -e` inside the container script makes a failing `lerobot-eval`
    # fail the step. Without it, the exit status of `bash -c` is that of the
    # last command (the task-description extraction), which masks eval
    # failures. The Hub login stays best-effort via `|| true`.
    #
    # ${VLABENCH_POLICY} / ${VLABENCH_TASKS} are expanded by the runner shell
    # (unescaped inside the double-quoted script); \$HF_USER_TOKEN is escaped
    # so it is expanded inside the container instead.
    - name: Run VLABench smoke eval (10 tasks, 1 episode each)
      if: env.HF_USER_TOKEN != ''
      run: |
        docker run --name vlabench-eval --gpus all \
          --shm-size=4g \
          -e HF_HOME=/tmp/hf \
          -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
          -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
          -e MUJOCO_GL=egl \
          lerobot-benchmark-vlabench:ci \
          bash -c "
            set -e
            hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
            lerobot-eval \
              --policy.path=${VLABENCH_POLICY} \
              --env.type=vlabench \
              --env.task=${VLABENCH_TASKS} \
              --eval.batch_size=1 \
              --eval.n_episodes=1 \
              --eval.use_async_envs=false \
              --policy.device=cuda \
              '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \
              --output_dir=/tmp/eval-artifacts
            python scripts/ci/extract_task_descriptions.py \
              --env vlabench \
              --task ${VLABENCH_TASKS} \
              --output /tmp/eval-artifacts/task_descriptions.json
          "

    # Best-effort: the container may not exist if the eval step was skipped
    # or failed early, hence the `|| true` guards.
    - name: Copy VLABench artifacts from container
      if: always()
      run: |
        mkdir -p /tmp/vlabench-artifacts
        docker cp vlabench-eval:/tmp/eval-artifacts/. /tmp/vlabench-artifacts/ 2>/dev/null || true
        docker rm -f vlabench-eval || true

    - name: Parse VLABench eval metrics
      if: always()
      run: |
        python3 scripts/ci/parse_eval_metrics.py \
          --artifacts-dir /tmp/vlabench-artifacts \
          --env vlabench \
          --task "${VLABENCH_TASKS}" \
          --policy "${VLABENCH_POLICY}"

    - name: Upload VLABench rollout video
      if: always()
      uses: actions/upload-artifact@v4  # zizmor: ignore[unpinned-uses]
      with:
        name: vlabench-rollout-video
        path: /tmp/vlabench-artifacts/videos/
        if-no-files-found: warn

    - name: Upload VLABench eval metrics
      if: always()
      uses: actions/upload-artifact@v4  # zizmor: ignore[unpinned-uses]
      with:
        name: vlabench-metrics
        path: /tmp/vlabench-artifacts/metrics.json
        if-no-files-found: warn
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for VLABench integration tests.
|
||||
# Extends the nightly GPU image with the PR's source code and VLABench setup.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control.
|
||||
# Clone without submodule recursion (nested SSH-only submodules fail in CI), then check out the pinned SHA.
|
||||
# Editable install (-e) because VLABench/utils/ has no __init__.py, so
|
||||
# find_packages() omits it from wheels; editable mode uses the source tree directly.
|
||||
# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py).
|
||||
# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import
|
||||
# time. Guard the call so missing dirs return [] instead of crashing (in case
|
||||
# the asset download is partial).
|
||||
#
|
||||
# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need
|
||||
# an upstream fix; don't rely on `main`/`develop` drift.
|
||||
ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017
|
||||
ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85
|
||||
RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \
|
||||
git -C ~/VLABench checkout ${VLABENCH_SHA} && \
|
||||
git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \
|
||||
git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \
|
||||
python3 -c "\
|
||||
import pathlib; \
|
||||
p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \
|
||||
t = p.read_text(); \
|
||||
p.write_text(t.replace( \
|
||||
'subdirs = os.listdir(xml_dir)', \
|
||||
'if not os.path.isdir(xml_dir): return []\n subdirs = os.listdir(xml_dir)'))" && \
|
||||
uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \
|
||||
mujoco==3.2.2 dm-control==1.0.22 \
|
||||
open3d colorlog scikit-learn openai gdown
|
||||
|
||||
# Download VLABench mesh assets. Task configs reference object meshes
|
||||
# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without
|
||||
# them the task builder picks from an empty mesh list and crashes with
|
||||
# IndexError at task-build time (random.choice([]) in config_manager.py).
|
||||
#
|
||||
# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time
|
||||
# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll
|
||||
# snapshot_download the repo into VLABench's assets dir. This is the reliable
|
||||
# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have
|
||||
# viewed or downloaded this file recently") on shared academic files.
|
||||
#
|
||||
# After download we *validate* that at least one XML exists under each
|
||||
# task-critical subtree and fail the build loudly if not. Silent-empty asset
|
||||
# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them
|
||||
# here rather than after a 10-minute eval build.
|
||||
#
|
||||
# Fallback: VLABench's own gdown-based script. Best-effort only.
|
||||
ARG VLABENCH_ASSETS_REPO=""
|
||||
RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \
|
||||
if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \
|
||||
echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \
|
||||
uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \
|
||||
python -c "from huggingface_hub import snapshot_download; \
|
||||
p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \
|
||||
local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \
|
||||
print('snapshot_download returned:', p)"; \
|
||||
else \
|
||||
echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \
|
||||
python ~/VLABench/scripts/download_assets.py --choice all; \
|
||||
fi && \
|
||||
python -c "\
|
||||
from pathlib import Path; \
|
||||
import sys; \
|
||||
root = Path('${ASSETS_DIR}'); \
|
||||
checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \
|
||||
failed = []; \
|
||||
print(f'Validating VLABench assets under {root}'); \
|
||||
[print(f' {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \
|
||||
[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \
|
||||
sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')"
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
# Re-install lerobot editably so the new source (with VLABenchEnv registration
|
||||
# and updated obs handling) replaces the stale package baked into the nightly image.
|
||||
RUN uv pip install --no-cache --no-deps -e .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -91,6 +91,8 @@
|
||||
title: RoboMME
|
||||
- local: envhub_isaaclab_arena
|
||||
title: NVIDIA IsaacLab Arena Environments
|
||||
- local: vlabench
|
||||
title: VLABench
|
||||
title: "Benchmarks"
|
||||
- sections:
|
||||
- local: introduction_processors
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
# VLABench
|
||||
|
||||
[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below).
|
||||
|
||||
- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194)
|
||||
- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench)
|
||||
- Project website: [vlabench.github.io](https://vlabench.github.io)
|
||||
- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench)
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/vlabench.png"
|
||||
alt="VLABench benchmark overview"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
## Available tasks
|
||||
|
||||
LeRobot exposes two VLABench task suites, covering **43 tasks** in total, through `--env.task`:
|
||||
|
||||
| Suite | CLI name | Tasks | Description |
|
||||
| --------- | ----------- | ----- | ---------------------------------------------------------------- |
|
||||
| Primitive | `primitive` | 21 | Single / few-skill combinations (select, insert, physics QA) |
|
||||
| Composite | `composite` | 22 | Multi-step reasoning and long-horizon planning (cook, rearrange) |
|
||||
|
||||
**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`).
|
||||
|
||||
**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`.
|
||||
|
||||
`--env.task` accepts three forms:
|
||||
|
||||
- a single task name (`select_fruit`)
|
||||
- a comma-separated list (`select_fruit,heat_food`)
|
||||
- a suite shortcut (`primitive`, `composite`, or `primitive,composite`)
|
||||
|
||||
## Installation
|
||||
|
||||
VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets:
|
||||
|
||||
```bash
|
||||
# After following the standard LeRobot installation instructions.
|
||||
|
||||
git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench
|
||||
git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms
|
||||
pip install -e ~/VLABench -e ~/rrt-algorithms
|
||||
pip install "mujoco==3.2.2" "dm-control==1.0.22" \
|
||||
open3d colorlog scikit-learn openai gdown
|
||||
|
||||
python ~/VLABench/scripts/download_assets.py
|
||||
```
|
||||
|
||||
<Tip>
|
||||
VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on.
|
||||
|
||||
### Single-task evaluation (recommended for quick iteration)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Pass a comma-separated list of tasks:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit,select_toy,add_condiment,heat_food \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Suite-wide evaluation
|
||||
|
||||
Run an entire suite (all 21 primitives or all 22 composites):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=primitive \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
--env.max_parallel_tasks=1 \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
Or both suites:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=primitive,composite \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
--env.max_parallel_tasks=1 \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper)
|
||||
- `observation.images.image` — front camera, 480×480 HWC uint8
|
||||
- `observation.images.second_image` — second camera, 480×480 HWC uint8
|
||||
- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous 7-dim control: 3D position + 3D Euler orientation, each bounded to `[-1, 1]`, plus a 1D gripper command bounded to `[0, 1]`.
|
||||
|
||||
## Training
|
||||
|
||||
### Datasets
|
||||
|
||||
Pre-collected VLABench datasets in LeRobot format on the Hub:
|
||||
|
||||
- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images.
|
||||
- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images.
|
||||
|
||||
### Example training command
|
||||
|
||||
Fine-tune a SmolVLA base on the primitive suite:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \
|
||||
--policy.load_vlm_weights=true \
|
||||
--policy.push_to_hub=true \
|
||||
--dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit \
|
||||
--output_dir=./outputs/smolvla_vlabench_primitive \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval_freq=5000 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--save_freq=10000
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark.
|
||||
@@ -212,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
|
||||
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
|
||||
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
|
||||
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
|
||||
# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
|
||||
# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
|
||||
# release), so any `vlabench>=X` pip spec is unresolvable. Install it
|
||||
# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx
|
||||
# for the recipe.
|
||||
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
|
||||
# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
|
||||
# resolve into a single env. Install it only in the RoboMME Docker image
|
||||
|
||||
@@ -142,6 +142,21 @@ def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) ->
|
||||
return out
|
||||
|
||||
|
||||
def _vlabench_descriptions(task_spec: str) -> dict[str, str]:
|
||||
"""For each task in the comma-separated list, emit a cleaned-name label.
|
||||
|
||||
VLABench tasks carry language instructions on their dm_control task
|
||||
object, but pulling them requires loading the full env per task
|
||||
(~seconds each). The CI smoke-eval already captures the instruction
|
||||
inside its episode info; this mapping is just enough to key
|
||||
`metrics.json` by `<task>_0`.
|
||||
"""
|
||||
out: dict[str, str] = {}
|
||||
for task in (t.strip() for t in task_spec.split(",") if t.strip()):
|
||||
out[f"{task}_0"] = task.replace("_", " ").strip()
|
||||
return out
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
|
||||
@@ -171,6 +186,8 @@ def main() -> int:
|
||||
descriptions = _robocasa_descriptions(args.task)
|
||||
elif args.env == "robomme":
|
||||
descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
|
||||
elif args.env == "vlabench":
|
||||
descriptions = _vlabench_descriptions(args.task)
|
||||
else:
|
||||
print(
|
||||
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
|
||||
|
||||
@@ -573,6 +573,71 @@ class RoboCasaEnv(EnvConfig):
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("vlabench")
@dataclass
class VLABenchEnv(EnvConfig):
    """Configuration for VLABench environments, registered as ``vlabench``.

    The action feature is always present. The three camera features (and, for
    ``pixels_agent_pos``, the 7-dim ``agent_pos`` state) are added in
    ``__post_init__`` according to ``obs_type`` and ``render_resolution``.
    """

    task: str = "select_fruit"
    fps: int = 10
    episode_length: int = 500
    obs_type: str = "pixels_agent_pos"
    render_mode: str = "rgb_array"
    render_resolution: tuple[int, int] = (480, 480)
    robot: str = "franka"
    action_mode: str = "eef"
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {
            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
        }
    )
    # Maps env-side feature keys onto the policy-side observation/action keys.
    features_map: dict[str, str] = field(
        default_factory=lambda: {
            ACTION: ACTION,
            "agent_pos": OBS_STATE,
            "pixels/image": f"{OBS_IMAGES}.image",
            "pixels/second_image": f"{OBS_IMAGES}.second_image",
            "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
        }
    )

    def __post_init__(self):
        """Register the observation features implied by ``obs_type``.

        Raises:
            ValueError: If ``obs_type`` is neither ``pixels`` nor
                ``pixels_agent_pos``.
        """
        height, width = self.render_resolution
        # Validate before touching `features` so an unsupported obs_type
        # leaves the feature dict unmodified.
        if self.obs_type not in ("pixels", "pixels_agent_pos"):
            raise ValueError(f"Unsupported obs_type: {self.obs_type}")

        # All three cameras share the same (H, W, 3) shape.
        for camera in ("image", "second_image", "wrist_image"):
            self.features[f"pixels/{camera}"] = PolicyFeature(
                type=FeatureType.VISUAL, shape=(height, width, 3)
            )

        if self.obs_type == "pixels_agent_pos":
            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,))

    @property
    def gym_kwargs(self) -> dict:
        """Keyword arguments forwarded to the VLABench gym environment."""
        return dict(
            obs_type=self.obs_type,
            render_mode=self.render_mode,
            render_resolution=self.render_resolution,
            robot=self.robot,
            max_episode_steps=self.episode_length,
            action_mode=self.action_mode,
        )

    def create_envs(self, n_envs: int, use_async_envs: bool = False):
        """Build the vectorized VLABench envs for the configured task(s).

        Args:
            n_envs: Number of environments, passed through to the factory.
            use_async_envs: Passed to ``_make_vec_env_cls`` to select the
                vector-env class.

        Raises:
            ValueError: If ``task`` is ``None``.
        """
        # Imported lazily so the VLABench wrapper module is only loaded when
        # envs are actually created.
        from .vlabench import create_vlabench_envs

        if self.task is None:
            raise ValueError("VLABenchEnv requires a task to be specified")

        vec_env_cls = _make_vec_env_cls(use_async_envs, n_envs)
        return create_vlabench_envs(
            task=self.task,
            n_envs=n_envs,
            gym_kwargs=self.gym_kwargs,
            env_cls=vec_env_cls,
        )
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("isaaclab_arena")
|
||||
@dataclass
|
||||
class IsaaclabArenaEnv(HubEnvConfig):
|
||||
|
||||
@@ -0,0 +1,589 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""VLABench environment wrapper for LeRobot.
|
||||
|
||||
VLABench is a large-scale benchmark for language-conditioned robotic manipulation
|
||||
with long-horizon reasoning, built on MuJoCo/dm_control.
|
||||
|
||||
- Paper: https://arxiv.org/abs/2412.18194
|
||||
- GitHub: https://github.com/OpenMOSS/VLABench
|
||||
- Website: https://vlabench.github.io
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from typing import Any
|
||||
|
||||
import cv2
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
from scipy.spatial.transform import Rotation
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Action layout: position (3) + Euler orientation (3) + gripper (1).
ACTION_DIM = 7
# The six pose components are bounded to [-1, 1]; the gripper to [0, 1].
ACTION_LOW = np.array([-1.0] * 6 + [0.0], dtype=np.float32)
ACTION_HIGH = np.array([1.0] * 7, dtype=np.float32)

# Default episode step limit (a single global value shared by all tasks).
DEFAULT_MAX_EPISODE_STEPS = 500

# VLABench task suites
PRIMITIVE_TASKS = [
    "select_fruit",
    "select_toy",
    "select_chemistry_tube",
    "add_condiment",
    "select_book",
    "select_painting",
    "select_drink",
    "insert_flower",
    "select_billiards",
    "select_ingredient",
    "select_mahjong",
    "select_poker",
    # Physical-reasoning series
    "density_qa",
    "friction_qa",
    "magnetism_qa",
    "reflection_qa",
    "simple_cuestick_usage",
    "simple_seesaw_usage",
    "sound_speed_qa",
    "thermal_expansion_qa",
    "weight_qa",
]

COMPOSITE_TASKS = [
    "cluster_billiards",
    "cluster_book",
    "cluster_drink",
    "cluster_toy",
    "cook_dishes",
    "cool_drink",
    "find_unseen_object",
    "get_coffee",
    "hammer_nail",
    "heat_food",
    "make_juice",
    "play_mahjong",
    "play_math_game",
    "play_poker",
    "play_snooker",
    "rearrange_book",
    "rearrange_chemistry_tube",
    "set_dining_table",
    "set_study_table",
    "store_food",
    "take_chemistry_experiment",
    "use_seesaw_complex",
]

# Suite shortcut name -> list of task names in that suite.
SUITE_TASKS: dict[str, list[str]] = dict(
    primitive=PRIMITIVE_TASKS,
    composite=COMPOSITE_TASKS,
)
|
||||
|
||||
|
||||
class VLABenchEnv(gym.Env):
|
||||
"""Gymnasium wrapper for VLABench environments.
|
||||
|
||||
Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface.
|
||||
Supports multiple cameras (front, second, wrist) and end-effector control.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str = "select_fruit",
|
||||
obs_type: str = "pixels_agent_pos",
|
||||
render_mode: str = "rgb_array",
|
||||
render_resolution: tuple[int, int] = (480, 480),
|
||||
robot: str = "franka",
|
||||
max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS,
|
||||
action_mode: str = "eef",
|
||||
):
|
||||
super().__init__()
|
||||
self.task = task
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.render_resolution = render_resolution
|
||||
self.robot = robot
|
||||
self._max_episode_steps = max_episode_steps
|
||||
self.action_mode = action_mode
|
||||
|
||||
# Deferred — created on first reset() inside worker subprocess to avoid
|
||||
# inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers.
|
||||
# We never cache `env.physics`: dm_control exposes it as a weakref
|
||||
# proxy that goes stale across resets (rebuilds the sim), so we always
|
||||
# refetch it via `self._env.physics` at the call site.
|
||||
self._env = None
|
||||
self.task_description = "" # populated on first reset
|
||||
# Cached world-frame XYZ of the robot base link. The VLABench datasets
|
||||
# log both `observation.state` positions and `actions` positions in
|
||||
# robot-base frame (see VLABench/scripts/convert_to_lerobot.py which
|
||||
# subtracts `robot_frame_pos` from ee_pos). The robot is attached at a
|
||||
# fixed offset per task so this is safe to cache once per env build.
|
||||
self._robot_base_xyz: np.ndarray | None = None
|
||||
|
||||
h, w = self.render_resolution
|
||||
|
||||
if self.obs_type == "state":
|
||||
raise NotImplementedError(
|
||||
"The 'state' observation type is not supported in VLABenchEnv. "
|
||||
"Please use 'pixels' or 'pixels_agent_pos'."
|
||||
)
|
||||
elif self.obs_type == "pixels":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
}
|
||||
)
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64),
|
||||
}
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||
|
||||
self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32)
|
||||
|
||||
# Max attempts to rebuild the underlying env when MuJoCo throws
|
||||
# `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step
|
||||
# reset warm-up. Some random task/layout samples land in unstable
|
||||
# initial configurations; re-sampling the layout almost always
|
||||
# gives a stable one. A handful of upstream tasks (notably
|
||||
# `select_mahjong`) have layout samplers that diverge often enough
|
||||
# to need >>5 retries, so we pick a generous ceiling.
|
||||
_ENSURE_ENV_MAX_ATTEMPTS = 20
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the underlying VLABench env on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own clean rendering context rather than inheriting a stale one from
|
||||
the parent process (which causes crashes with AsyncVectorEnv).
|
||||
|
||||
Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20
|
||||
warm-up `step()` calls while toggling gravity/fluids to let the scene
|
||||
settle; for some random layouts MuJoCo's integrator diverges and
|
||||
raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields
|
||||
a stable one, so we retry a number of times before giving up. Between
|
||||
attempts we reseed NumPy's global RNG from OS entropy so the upstream
|
||||
task sampler explores fresh initial states — without this, retries
|
||||
can replay the same diverging configuration when the sampler is
|
||||
deterministic given the current RNG state.
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
|
||||
import VLABench.robots # noqa: F401 # type: ignore[import-untyped]
|
||||
import VLABench.tasks # noqa: F401 # type: ignore[import-untyped]
|
||||
from dm_control.rl.control import PhysicsError # type: ignore[import-untyped]
|
||||
from VLABench.envs import load_env # type: ignore[import-untyped]
|
||||
|
||||
h, w = self.render_resolution
|
||||
last_exc: PhysicsError | None = None
|
||||
for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
|
||||
try:
|
||||
env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
|
||||
self._env = env
|
||||
break
|
||||
except PhysicsError as exc:
|
||||
last_exc = exc
|
||||
logger.warning(
|
||||
"PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
|
||||
attempt,
|
||||
self._ENSURE_ENV_MAX_ATTEMPTS,
|
||||
self.task,
|
||||
exc,
|
||||
)
|
||||
np.random.seed(None)
|
||||
if self._env is None:
|
||||
assert last_exc is not None
|
||||
raise RuntimeError(
|
||||
f"VLABench task '{self.task}' failed to produce a stable "
|
||||
f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
|
||||
f"attempts. This task's upstream sampler diverges too "
|
||||
f"often for the configured robot; consider removing it "
|
||||
f"from the eval set. Last physics error: {last_exc}"
|
||||
) from last_exc
|
||||
|
||||
# Extract task description from the dm_control task
|
||||
task_obj = self._env.task
|
||||
if hasattr(task_obj, "task_description"):
|
||||
self.task_description = task_obj.task_description
|
||||
elif hasattr(task_obj, "language_instruction"):
|
||||
self.task_description = task_obj.language_instruction
|
||||
else:
|
||||
self.task_description = self.task
|
||||
|
||||
# Cache robot base world position so `_build_ctrl_from_action` and
|
||||
# `_get_obs` can translate between robot-frame (dataset) and
|
||||
# world-frame (dm_control) without hitting physics every call.
|
||||
try:
|
||||
self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
|
||||
3
|
||||
)
|
||||
except Exception:
|
||||
# Fallback to VLABench's default Franka base position.
|
||||
self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)
|
||||
|
||||
def _get_obs(self) -> dict:
|
||||
"""Get current observation from the environment."""
|
||||
assert self._env is not None
|
||||
|
||||
obs = self._env.get_observation()
|
||||
h, w = self.render_resolution
|
||||
|
||||
def _to_hwc3(arr: np.ndarray) -> np.ndarray:
|
||||
"""Coerce any camera array to the declared (h, w, 3) uint8 shape."""
|
||||
a = np.asarray(arr)
|
||||
# Drop a leading singleton batch dim if present.
|
||||
while a.ndim > 3 and a.shape[0] == 1:
|
||||
a = a[0]
|
||||
if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
|
||||
# CHW → HWC
|
||||
a = np.transpose(a, (1, 2, 0))
|
||||
if a.ndim == 2:
|
||||
a = np.stack([a] * 3, axis=-1)
|
||||
if a.ndim != 3:
|
||||
return np.zeros((h, w, 3), dtype=np.uint8)
|
||||
# Force 3 channels.
|
||||
if a.shape[-1] == 1:
|
||||
a = np.repeat(a, 3, axis=-1)
|
||||
elif a.shape[-1] == 4:
|
||||
a = a[..., :3]
|
||||
elif a.shape[-1] != 3:
|
||||
return np.zeros((h, w, 3), dtype=np.uint8)
|
||||
if a.shape[:2] != (h, w):
|
||||
a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
|
||||
return a.astype(np.uint8)
|
||||
|
||||
# Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
|
||||
raw_frames: list[np.ndarray] = []
|
||||
if "rgb" in obs:
|
||||
rgb = obs["rgb"]
|
||||
if isinstance(rgb, np.ndarray):
|
||||
if rgb.ndim == 4:
|
||||
raw_frames = [rgb[i] for i in range(rgb.shape[0])]
|
||||
elif rgb.ndim == 3:
|
||||
raw_frames = [rgb]
|
||||
|
||||
image_keys = ["image", "second_image", "wrist_image"]
|
||||
images: dict[str, np.ndarray] = {}
|
||||
for i, key in enumerate(image_keys):
|
||||
if i < len(raw_frames):
|
||||
images[key] = _to_hwc3(raw_frames[i])
|
||||
else:
|
||||
images[key] = np.zeros((h, w, 3), dtype=np.uint8)
|
||||
|
||||
# Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
|
||||
# to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
|
||||
# gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
|
||||
# are stored in robot-base frame and orientations as scipy extrinsic
|
||||
# 'xyz' euler angles.
|
||||
raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
|
||||
pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
|
||||
quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
|
||||
gripper = float(raw[7]) if raw.size >= 8 else 0.0
|
||||
|
||||
base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
|
||||
pos_robot = pos_world - base
|
||||
euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
|
||||
"xyz", degrees=False
|
||||
)
|
||||
|
||||
ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
return {"pixels": images}
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
return {
|
||||
"pixels": images,
|
||||
"agent_pos": ee_state.astype(np.float64),
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Unknown obs_type: {self.obs_type}")
|
||||
|
||||
# ---- Action adaptation (EEF → joint ctrl) --------------------------------
|
||||
#
|
||||
# The HF vlabench datasets log 7D actions
|
||||
# `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
|
||||
# exactly matching VLABench's own eval pipeline (evaluator.base):
|
||||
# pos, euler, g = policy(...)
|
||||
# quat = euler_to_quaternion(*euler) # extrinsic xyz -> wxyz
|
||||
# _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
|
||||
# env.step(np.concatenate([qpos, [g, g]]))
|
||||
#
|
||||
# VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
|
||||
# Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
|
||||
# above conversion so the policy's EEF commands actually drive the robot.
|
||||
|
||||
_FRANKA_FINGER_OPEN = 0.04 # qpos when gripper fully open
|
||||
|
||||
def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
|
||||
"""Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector.
|
||||
|
||||
For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) +
|
||||
2 gripper finger qposes (open/closed based on the gripper scalar).
|
||||
If the action is already joint-space (shape matches ctrl_dim), pass
|
||||
through.
|
||||
"""
|
||||
if action.shape[0] == ctrl_dim:
|
||||
return action.astype(np.float64, copy=False)
|
||||
|
||||
if action.shape[0] != 7:
|
||||
# Unknown layout — fall back to zero-pad so the sim doesn't crash.
|
||||
padded = np.zeros(ctrl_dim, dtype=np.float64)
|
||||
padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim]
|
||||
return padded
|
||||
|
||||
from dm_control.utils.inverse_kinematics import qpos_from_site_pose
|
||||
|
||||
# Action position is in robot-base frame (see convert_to_lerobot.py);
|
||||
# dm_control's IK expects a world-frame target.
|
||||
base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
|
||||
pos_world = np.asarray(action[:3], dtype=np.float64) + base
|
||||
rx, ry, rz = float(action[3]), float(action[4]), float(action[5])
|
||||
gripper = float(np.clip(action[6], 0.0, 1.0))
|
||||
|
||||
# Dataset euler is scipy extrinsic 'xyz' (same as VLABench's
|
||||
# `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK
|
||||
# and MuJoCo use `[w, x, y, z]`, so reorder.
|
||||
qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat()
|
||||
quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64)
|
||||
|
||||
assert self._env is not None
|
||||
robot = self._env.task.robot
|
||||
site_name = robot.end_effector_site.full_identifier
|
||||
|
||||
# inplace=False so IK doesn't mutate physics state mid-step — we only
|
||||
# want the solved qpos. Fetch a fresh physics handle — caching it can
|
||||
# yield a stale weakref after a reset.
|
||||
ik_result = qpos_from_site_pose(
|
||||
self._env.physics,
|
||||
site_name=site_name,
|
||||
target_pos=pos_world,
|
||||
target_quat=quat,
|
||||
inplace=False,
|
||||
max_steps=100,
|
||||
)
|
||||
n_dof = robot.n_dof # 7 for Franka
|
||||
arm_qpos = ik_result.qpos[:n_dof]
|
||||
|
||||
# Dataset gripper convention: 1 = open (finger qpos = 0.04),
|
||||
# 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
|
||||
# where `trajectory[i][-1] > 0.03` is encoded as `1`.
|
||||
finger_qpos = gripper * self._FRANKA_FINGER_OPEN
|
||||
|
||||
ctrl = np.zeros(ctrl_dim, dtype=np.float64)
|
||||
ctrl[:n_dof] = arm_qpos
|
||||
# Remaining entries are gripper fingers (usually 2 for Franka).
|
||||
ctrl[n_dof:] = finger_qpos
|
||||
return ctrl
|
||||
|
||||
def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
    """Reset the episode and return the first observation.

    Builds the inner env if needed, lets the gym base class handle `seed`,
    forwards a derived seed to the inner env when one was given, then resets
    the inner env. Extra keyword arguments are accepted but not used.

    Returns:
        `(observation, info)` where `info` is `{"is_success": False}`.
    """
    self._ensure_env()
    assert self._env is not None
    super().reset(seed=seed)

    if seed is not None:
        # Draw the inner seed from the freshly seeded gym RNG.
        inner_seed = int(self.np_random.integers(0, 2**31 - 1))
        self._seed_inner_env(inner_seed)

    self._env.reset()

    first_obs = self._get_obs()
    return first_obs, {"is_success": False}
|
||||
|
||||
def _seed_inner_env(self, seed: int) -> None:
|
||||
"""Propagate `seed` to the inner dm_control env. `Environment.reset()`
|
||||
doesn't accept a seed, so we re-seed the task and environment
|
||||
`RandomState`s directly. Best-effort: silently skipped when the
|
||||
expected attributes are absent on a given VLABench version.
|
||||
"""
|
||||
for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
|
||||
owner = getattr(self._env, owner_attr) if owner_attr else self._env
|
||||
rng = getattr(owner, rng_attr, None)
|
||||
rng_seed = getattr(rng, "seed", None)
|
||||
if callable(rng_seed):
|
||||
rng_seed(seed)
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
from dm_control.rl.control import PhysicsError # type: ignore[import-untyped]
|
||||
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
|
||||
if action.ndim != 1:
|
||||
raise ValueError(
|
||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||
f"but got shape {action.shape} with ndim={action.ndim}"
|
||||
)
|
||||
|
||||
if self.action_mode not in ("eef", "joint", "delta_eef"):
|
||||
raise ValueError(f"Unknown action_mode: {self.action_mode}")
|
||||
|
||||
# Always refetch physics — dm_control returns a weakref proxy that can
|
||||
# go stale across resets.
|
||||
physics = self._env.physics
|
||||
ctrl_dim = int(physics.data.ctrl.shape[0])
|
||||
ctrl = self._build_ctrl_from_action(action, ctrl_dim)
|
||||
try:
|
||||
timestep = self._env.step(ctrl)
|
||||
except PhysicsError as exc:
|
||||
# Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
|
||||
# a graceful failed termination rather than a hard crash — the
|
||||
# rest of the multi-task eval should still run.
|
||||
logger.warning(
|
||||
"PhysicsError during step on task '%s': %s. Terminating episode.",
|
||||
self.task,
|
||||
exc,
|
||||
)
|
||||
observation = self._get_obs()
|
||||
info = {"task": self.task, "is_success": False, "physics_error": True}
|
||||
# Drop the stale env so the next reset() rebuilds it cleanly.
|
||||
with contextlib.suppress(Exception):
|
||||
self._env.close()
|
||||
self._env = None
|
||||
return observation, 0.0, True, False, info
|
||||
|
||||
# Extract reward from dm_control timestep
|
||||
reward = float(timestep.reward) if timestep.reward is not None else 0.0
|
||||
|
||||
# Check success via the task's termination condition
|
||||
is_success = False
|
||||
if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
|
||||
is_success = bool(self._env.task.should_terminate_episode(self._env.physics))
|
||||
|
||||
terminated = is_success
|
||||
truncated = False
|
||||
info = {
|
||||
"task": self.task,
|
||||
"is_success": is_success,
|
||||
}
|
||||
|
||||
observation = self._get_obs()
|
||||
|
||||
if terminated:
|
||||
self.reset()
|
||||
|
||||
return observation, reward, terminated, truncated, info
|
||||
|
||||
def render(self) -> np.ndarray:
    """Return the primary camera frame (`pixels["image"]`) of the current observation.

    Builds the inner env first if it does not exist yet.
    """
    self._ensure_env()
    frames = self._get_obs()["pixels"]
    return frames["image"]
|
||||
|
||||
def close(self):
    """Close the inner VLABench env, if one was created.

    The reference to the inner env is dropped even when its `close()`
    raises, so a half-closed env is never reused or closed a second time.
    Any exception from the inner `close()` still propagates to the caller.
    Calling this when no env exists is a no-op.
    """
    if self._env is None:
        return
    try:
        self._env.close()
    finally:
        self._env = None
|
||||
|
||||
|
||||
# ---- Main API ----------------------------------------------------------------
|
||||
|
||||
|
||||
def create_vlabench_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
gym_kwargs: dict[str, Any] | None = None,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""
|
||||
Create vectorized VLABench environments with a consistent return shape.
|
||||
|
||||
Returns:
|
||||
dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
|
||||
|
||||
Notes:
|
||||
- n_envs is the number of rollouts *per task*.
|
||||
- `task` can be a suite name ("primitive", "composite"), a comma-separated list of
|
||||
suite names, or individual task names (e.g. "select_fruit,heat_food").
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
task_groups = [t.strip() for t in task.split(",") if t.strip()]
|
||||
if not task_groups:
|
||||
raise ValueError("`task` must contain at least one VLABench task or suite name.")
|
||||
|
||||
logger.info(
|
||||
"Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
|
||||
task_groups,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space = None
|
||||
cached_act_space = None
|
||||
cached_metadata = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for group in task_groups:
|
||||
# Check if it's a suite name, otherwise treat as individual task
|
||||
tasks = SUITE_TASKS.get(group, [group])
|
||||
|
||||
for tid, task_name in enumerate(tasks):
|
||||
logger.info(
|
||||
"Building vec env | group=%s | task_id=%d | task=%s",
|
||||
group,
|
||||
tid,
|
||||
task_name,
|
||||
)
|
||||
|
||||
fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[group][tid] = lazy
|
||||
else:
|
||||
out[group][tid] = env_cls(fns)
|
||||
|
||||
return {group: dict(task_map) for group, task_map in out.items()}
|
||||
Reference in New Issue
Block a user