mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-25 13:40:00 +00:00
feat(envs): add RoboTwin 2.0 benchmark integration
- RoboTwinEnvConfig with 4-camera setup (head/front/left_wrist/right_wrist) - Docker image with SAPIEN, mplib, CuRobo, pytorch3d (Python 3.12) - CI workflow: 1-episode smoke eval with pepijn223/smolvla_robotwin - RoboTwinProcessorStep for state float32 casting - Camera rename_map: head_camera/front_camera/left_wrist -> camera1/2/3 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -310,3 +310,94 @@ jobs:
|
||||
name: metaworld-metrics
|
||||
path: /tmp/metaworld-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
|
||||
# Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
|
||||
# pytorch3d, + simulation assets (~4 GB).
|
||||
# Build takes ~20 min on first run; subsequent runs hit the layer cache.
|
||||
# Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
|
||||
robotwin-integration-test:
|
||||
name: RoboTwin 2.0 — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
# Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
|
||||
# simulation assets (~4 GB). Layer cache lives in the runner's local
|
||||
# Docker daemon — reused across re-runs on the same machine.
|
||||
- name: Build RoboTwin 2.0 benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robotwin
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robotwin:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-robotwin
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
|
||||
|
||||
- name: Run RoboTwin 2.0 smoke eval (1 episode)
|
||||
run: |
|
||||
# Named container (no --rm) so we can docker cp artifacts out.
|
||||
docker run --name robotwin-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
lerobot-benchmark-robotwin:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
cd /opt/robotwin && lerobot-eval \
|
||||
--policy.path=pepijn223/smolvla_robotwin \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.front_camera\": \"observation.images.camera2\", \"observation.images.left_wrist\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
"
|
||||
|
||||
- name: Copy RoboTwin artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robotwin-artifacts
|
||||
docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robotwin-eval || true
|
||||
|
||||
- name: Parse RoboTwin eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robotwin-artifacts \
|
||||
--env robotwin \
|
||||
--task beat_block_hammer \
|
||||
--policy pepijn223/smolvla_robotwin
|
||||
|
||||
- name: Upload RoboTwin rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-rollout-video
|
||||
path: /tmp/robotwin-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboTwin eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-metrics
|
||||
path: /tmp/robotwin-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Isolated benchmark image for RoboTwin 2.0 integration tests.
|
||||
# Full installation: SAPIEN, mplib, CuRobo, pytorch3d + simulation assets.
|
||||
#
|
||||
# RoboTwin install guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html
|
||||
# Assets from: https://huggingface.co/datasets/TianxingChen/RoboTwin2.0
|
||||
# - embodiments.zip ~220 MB
|
||||
# - objects.zip ~3.74 GB
|
||||
# (background_texture.zip ~11 GB is skipped — not required for a smoke eval)
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \
|
||||
# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
|
||||
|
||||
# RoboTwin requires CUDA devel image for CuRobo compilation (nvcc needed).
|
||||
# Python 3.12 required by lerobot (type alias syntax). open3d 0.19.0 has cp312 wheels.
|
||||
ARG CUDA_VERSION=12.1.1
|
||||
ARG OS_VERSION=22.04
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION}
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PATH=/lerobot/.venv/bin:$PATH \
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
DEVICE=cuda \
|
||||
# NVIDIA Container Toolkit: expose all driver capabilities (includes Vulkan)
|
||||
NVIDIA_DRIVER_CAPABILITIES=all \
|
||||
# SAPIEN uses Vulkan; point at the NVIDIA ICD we create below
|
||||
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
# RoboTwin task modules live in <repo>/envs/; add to PYTHONPATH after clone.
|
||||
ROBOTWIN_ROOT=/opt/robotwin
|
||||
|
||||
# System deps — extended with cmake/ninja for CuRobo and pytorch3d compilation.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
software-properties-common build-essential git curl \
|
||||
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
|
||||
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
|
||||
cmake pkg-config ninja-build \
|
||||
libvulkan1 vulkan-tools \
|
||||
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
&& curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \
|
||||
&& mv /root/.local/bin/uv /usr/local/bin/uv \
|
||||
&& mkdir -p /usr/share/vulkan/icd.d \
|
||||
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
|
||||
> /usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
&& useradd --create-home --shell /bin/bash user_lerobot \
|
||||
&& usermod -aG sudo user_lerobot \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /lerobot
|
||||
RUN chown -R user_lerobot:user_lerobot /lerobot
|
||||
USER user_lerobot
|
||||
|
||||
ENV HOME=/home/user_lerobot \
|
||||
HF_HOME=/home/user_lerobot/.cache/huggingface \
|
||||
HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
|
||||
TORCH_HOME=/home/user_lerobot/.cache/torch \
|
||||
TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
|
||||
|
||||
RUN uv venv --python python${PYTHON_VERSION}
|
||||
|
||||
# ── 1. Install base lerobot ────────────────────────────────────────────────
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||
COPY --chown=user_lerobot:user_lerobot src/ src/
|
||||
|
||||
# Install lerobot base only (no benchmark extras — RoboTwin is source-only).
|
||||
# Install lerobot with smolvla extra (for transformers dep needed by SmolVLA policy)
|
||||
RUN uv pip install --no-cache -e ".[smolvla]"
|
||||
|
||||
# ── 2. Install RoboTwin 2.0 simulator stack ────────────────────────────────
|
||||
# Clone at a pinned commit for reproducibility.
|
||||
USER root
|
||||
RUN git clone --depth=1 https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
|
||||
&& chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT}
|
||||
USER user_lerobot
|
||||
|
||||
# Install RoboTwin-specific packages on top of the lerobot venv.
|
||||
# We intentionally skip: torch/torchvision (already in lerobot), gymnasium
|
||||
# (lerobot uses 1.x; RoboTwin's API is wrapped so 0.29 is not needed), scipy
|
||||
# (version-pinned by lerobot), huggingface_hub (pinned by lerobot), wandb,
|
||||
# azure, openai, pyglet (UI only).
|
||||
# Install RoboTwin-specific packages. Use --python to pin resolution to the
|
||||
# venv's Python 3.10 (uv sync --locked may have changed the default target).
|
||||
RUN uv pip install --no-cache --python .venv/bin/python \
|
||||
"sapien==3.0.0b1" \
|
||||
"mplib==0.2.1" \
|
||||
"transforms3d==0.4.2" \
|
||||
"trimesh==4.4.3" \
|
||||
"open3d==0.19.0" \
|
||||
"imageio==2.34.2" \
|
||||
"termcolor" \
|
||||
"zarr" \
|
||||
"pydantic" \
|
||||
"h5py"
|
||||
|
||||
# pytorch3d — must be built from source (no universal wheel available).
|
||||
# This is the slowest step (~10 min); cached in subsequent builds.
|
||||
RUN uv pip install --no-cache --no-build-isolation --python .venv/bin/python \
|
||||
"git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
|
||||
# CuRobo — NVIDIA motion generation library; requires nvcc (devel image).
|
||||
# TORCH_CUDA_ARCH_LIST must be set or the build fails with empty arch list.
|
||||
RUN cd ${ROBOTWIN_ROOT}/envs \
|
||||
&& git clone --depth=1 https://github.com/NVlabs/curobo.git \
|
||||
&& cd curobo \
|
||||
&& TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
|
||||
uv pip install -e . --no-build-isolation --no-cache
|
||||
|
||||
# ── 3. Apply upstream patches (mirrors script/_install.sh) ─────────────────
|
||||
# patch 1: mplib — remove `or collide` from planner.py line 807.
|
||||
RUN python - <<'EOF'
|
||||
import re, pathlib, site
|
||||
for d in site.getsitepackages():
|
||||
p = pathlib.Path(d) / "mplib" / "planner.py"
|
||||
if p.exists():
|
||||
src = p.read_text()
|
||||
patched = re.sub(r"\bor collide\b", "", src, count=1)
|
||||
p.write_text(patched)
|
||||
print(f"mplib patch applied: {p}")
|
||||
break
|
||||
EOF
|
||||
|
||||
# patch 2: sapien URDF loader — add UTF-8 encoding + fix .srdf extension.
|
||||
RUN python - <<'EOF'
|
||||
import pathlib, site
|
||||
for d in site.getsitepackages():
|
||||
p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
|
||||
if p.exists():
|
||||
src = p.read_text()
|
||||
# add encoding='utf-8' to open() calls that lack it
|
||||
patched = src.replace("open(", "open(").replace(
|
||||
'with open(srdf_path) as f:', 'with open(srdf_path, encoding="utf-8") as f:'
|
||||
).replace('"srdf"', '".srdf"')
|
||||
p.write_text(patched)
|
||||
print(f"sapien patch applied: {p}")
|
||||
break
|
||||
EOF
|
||||
|
||||
# ── 4. Download simulation assets from HuggingFace ─────────────────────────
|
||||
# embodiments.zip (~220 MB) + objects.zip (~3.74 GB).
|
||||
# background_texture.zip (~11 GB) is skipped — not required for a smoke eval.
|
||||
# Set HF_TOKEN (passed as --build-arg or via --secret) for authenticated access.
|
||||
ARG HF_TOKEN=""
|
||||
RUN python - <<'EOF'
|
||||
import os, pathlib, zipfile
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
token = os.environ.get("HF_TOKEN") or None
|
||||
assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
|
||||
assets_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for fname in ("embodiments.zip", "objects.zip"):
|
||||
print(f"Downloading {fname} ...")
|
||||
local = hf_hub_download(
|
||||
repo_id="TianxingChen/RoboTwin2.0",
|
||||
repo_type="dataset",
|
||||
filename=fname,
|
||||
token=token,
|
||||
local_dir=str(assets_dir),
|
||||
)
|
||||
print(f"Extracting {fname} ...")
|
||||
with zipfile.ZipFile(local, "r") as z:
|
||||
z.extractall(str(assets_dir))
|
||||
pathlib.Path(local).unlink() # remove zip after extraction
|
||||
print(f"{fname} done.")
|
||||
EOF
|
||||
|
||||
# Update embodiment config paths to reflect the installation directory.
|
||||
RUN cd ${ROBOTWIN_ROOT} \
|
||||
&& python script/update_embodiment_config_path.py
|
||||
|
||||
# ── 5. Finalise ────────────────────────────────────────────────────────────
|
||||
# Expose RoboTwin task modules on PYTHONPATH so `import envs.<task>` works.
|
||||
ENV PYTHONPATH="${ROBOTWIN_ROOT}:${PYTHONPATH}"
|
||||
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas 2>/dev/null || true
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -0,0 +1,206 @@
|
||||
# RoboTwin 2.0
|
||||
|
||||
RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 60 tasks with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
|
||||
|
||||
- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://robotwin-platform.github.io)
|
||||
- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
|
||||
- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
|
||||
- Dataset: [hxma/RoboTwin-LeRobot-v3.0](https://huggingface.co/datasets/hxma/RoboTwin-LeRobot-v3.0)
|
||||
|
||||
## Overview
|
||||
|
||||
| Property | Value |
|
||||
| ------------- | ---------------------------------------------------------- |
|
||||
| Tasks | 60 dual-arm manipulation tasks |
|
||||
| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) |
|
||||
| Action space | 14-dim joint-space, continuous in `[-1, 1]` |
|
||||
| Cameras | `head_camera`, `front_camera`, `left_wrist`, `right_wrist` |
|
||||
| Simulator | SAPIEN (not MuJoCo) |
|
||||
| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations |
|
||||
| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboTwin 2.0 ships with 60 dual-arm manipulation tasks. The full list appears on the [leaderboard](https://robotwin-platform.github.io/leaderboard). Example tasks:
|
||||
|
||||
| Task | CLI name | Category |
|
||||
| ---------------------- | ------------------------ | ---------------- |
|
||||
| Beat block with hammer | `beat_block_hammer` | Tool use |
|
||||
| Open / close laptop | `open_laptop` | Articulated obj |
|
||||
| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking |
|
||||
| Pour water | `pour_water` | Deformable/fluid |
|
||||
| Fold cloth | `fold_cloth` | Deformable |
|
||||
| Handover block | `handover_block` | Bimanual coord. |
|
||||
| Place shoes | `place_shoes_left/right` | Precision place |
|
||||
| Scan object | `scan_object` | Mobile manip. |
|
||||
|
||||
Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
|
||||
|
||||
## Dataset
|
||||
|
||||
The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
|
||||
|
||||
```
|
||||
hxma/RoboTwin-LeRobot-v3.0
|
||||
```
|
||||
|
||||
It contains over 100,000 pre-collected trajectories across all 60 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
|
||||
|
||||
You can load it directly with the HF Datasets library:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hxma/RoboTwin-LeRobot-v3.0", split="train")
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
|
||||
|
||||
### 1. Create a conda environment
|
||||
|
||||
```bash
|
||||
conda create -n robotwin python=3.10 -y
|
||||
conda activate robotwin
|
||||
```
|
||||
|
||||
### 2. Install LeRobot
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/lerobot.git
|
||||
cd lerobot
|
||||
pip install -e "."
|
||||
```
|
||||
|
||||
### 3. Install RoboTwin 2.0
|
||||
|
||||
```bash
|
||||
git clone https://github.com/RoboTwin-Platform/RoboTwin.git
|
||||
cd RoboTwin
|
||||
bash script/_install.sh
|
||||
bash script/_download_assets.sh
|
||||
```
|
||||
|
||||
The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
|
||||
|
||||
<Tip warning={true}>
|
||||
If the automated install fails, install manually:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
|
||||
pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
|
||||
|
||||
</Tip>
|
||||
|
||||
### 4. Add RoboTwin to PYTHONPATH
|
||||
|
||||
The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
|
||||
```
|
||||
|
||||
Add this to your shell profile to make it permanent.
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Standard evaluation (recommended)
|
||||
|
||||
Evaluate a policy on a single task with the official protocol (100 episodes):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Single-task quick check
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=5
|
||||
```
|
||||
|
||||
### Multi-task sweep
|
||||
|
||||
Evaluate on several tasks in one run:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Full benchmark (all 60 tasks)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,close_laptop,close_microwave,dump_bin,grab_roller,handover_block,handover_cup,handover_diverse_bottles,handover_mic,hanging_mug,insert_pin,lift_pot,make_tea,open_laptop,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_basket,place_block,place_cable,place_can,place_chopsticks,place_cloth,place_container,place_cup,place_diverse_bottles,place_dual_bottles,place_fork,place_knife,place_object_basket,place_ring,place_ruler,place_shoes_left,place_shoes_right,place_spoon,place_toy,pour_water,press_stapler,put_bottles_dustbin,put_object_cabinet,put_shoes_box,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch,wipe_board,arrange_tools,build_tower,fold_cloth \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
## Camera configuration
|
||||
|
||||
By default, all four cameras are included:
|
||||
|
||||
| Camera key | Description |
|
||||
| -------------- | ------------------------------ |
|
||||
| `head_camera` | Overhead / third-person view |
|
||||
| `front_camera` | Front-facing static camera |
|
||||
| `left_wrist` | Left arm wrist-mounted camera |
|
||||
| `right_wrist` | Right arm wrist-mounted camera |
|
||||
|
||||
To use a subset of cameras, override `--env.camera_names`:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--env.camera_names="head_camera,left_wrist,right_wrist" \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
## Environment config reference
|
||||
|
||||
Key parameters for `RoboTwinEnvConfig`:
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------- | --------------------------------------------------- | ---------------------------------- |
|
||||
| `task` | `"beat_block_hammer"` | Comma-separated task name(s) |
|
||||
| `fps` | `25` | Simulation FPS |
|
||||
| `episode_length` | `300` | Max steps per episode |
|
||||
| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` |
|
||||
| `camera_names` | `"head_camera,front_camera,left_wrist,right_wrist"` | Comma-separated active cameras |
|
||||
| `observation_height` | `480` | Camera pixel height |
|
||||
| `observation_width` | `640` | Camera pixel width |
|
||||
|
||||
## Leaderboard submission
|
||||
|
||||
Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
|
||||
|
||||
- Training on 50 `demo_clean` demonstrations per task
|
||||
- Evaluating 100 episodes per task
|
||||
- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
|
||||
|
||||
For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
|
||||
@@ -574,3 +574,97 @@ class IsaaclabArenaEnv(HubEnvConfig):
|
||||
),
|
||||
PolicyProcessorPipeline(steps=[]),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robotwin")
|
||||
@dataclass
|
||||
class RoboTwinEnvConfig(EnvConfig):
|
||||
"""Configuration for RoboTwin 2.0 benchmark environments.
|
||||
|
||||
RoboTwin 2.0 is a dual-arm manipulation benchmark with 60 tasks built on the
|
||||
SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
|
||||
(7 per arm). All four cameras are enabled by default.
|
||||
|
||||
See: https://robotwin-platform.github.io
|
||||
Dataset: https://huggingface.co/datasets/hxma/RoboTwin-LeRobot-v3.0
|
||||
"""
|
||||
|
||||
task: str = "beat_block_hammer" # single task or comma-separated list
|
||||
fps: int = 25
|
||||
episode_length: int = 300
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
# Comma-separated list of cameras to include in observations.
|
||||
# Available: head_camera, front_camera, left_wrist, right_wrist
|
||||
camera_names: str = "head_camera,front_camera,left_wrist,right_wrist"
|
||||
observation_height: int = 480
|
||||
observation_width: int = 640
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
|
||||
}
|
||||
)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"pixels/head_camera": f"{OBS_IMAGES}.head_camera",
|
||||
"pixels/front_camera": f"{OBS_IMAGES}.front_camera",
|
||||
"pixels/left_wrist": f"{OBS_IMAGES}.left_wrist",
|
||||
"pixels/right_wrist": f"{OBS_IMAGES}.right_wrist",
|
||||
"agent_pos": OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
for cam in cam_list:
|
||||
self.features[f"pixels/{cam}"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
)
|
||||
# Keep features_map entry if already set (default_factory); add if missing.
|
||||
key = f"pixels/{cam}"
|
||||
if key not in self.features_map:
|
||||
self.features_map[key] = f"{OBS_IMAGES}.{cam}"
|
||||
|
||||
if self.obs_type == "pixels_agent_pos":
|
||||
self.features["agent_pos"] = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(14,), # 14 DOF: 7 per arm
|
||||
)
|
||||
elif self.obs_type != "pixels":
|
||||
raise ValueError(
|
||||
f"Unsupported obs_type '{self.obs_type}'. "
|
||||
"RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
|
||||
)
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from lerobot.envs.robotwin import create_robotwin_envs
|
||||
|
||||
if not self.task:
|
||||
raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
|
||||
|
||||
env_cls = gym.vector.AsyncVectorEnv if (use_async_envs and n_envs > 1) else gym.vector.SyncVectorEnv
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
return create_robotwin_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
env_cls=env_cls,
|
||||
camera_names=cam_list,
|
||||
observation_height=self.observation_height,
|
||||
observation_width=self.observation_width,
|
||||
episode_length=self.episode_length,
|
||||
)
|
||||
|
||||
def get_env_processors(self):
|
||||
from lerobot.processor.env_processor import RoboTwinProcessorStep
|
||||
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||
|
||||
return (
|
||||
PolicyProcessorPipeline(steps=[RoboTwinProcessorStep()]),
|
||||
PolicyProcessorPipeline(steps=[]),
|
||||
)
|
||||
|
||||
@@ -0,0 +1,465 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
|
||||
# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
|
||||
ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
|
||||
"head_camera",
|
||||
"front_camera",
|
||||
"left_wrist",
|
||||
"right_wrist",
|
||||
)
|
||||
|
||||
ACTION_DIM = 14 # 7 DOF × 2 arms
|
||||
ACTION_LOW = -1.0
|
||||
ACTION_HIGH = 1.0
|
||||
DEFAULT_EPISODE_LENGTH = 300
|
||||
|
||||
# Complete task list from RoboTwin 2.0 (60 tasks, as listed on the leaderboard).
|
||||
ROBOTWIN_TASKS: tuple[str, ...] = (
|
||||
"adjust_bottle",
|
||||
"beat_block_hammer",
|
||||
"blocks_ranking_rgb",
|
||||
"blocks_ranking_size",
|
||||
"click_alarmclock",
|
||||
"click_bell",
|
||||
"close_laptop",
|
||||
"close_microwave",
|
||||
"dump_bin",
|
||||
"grab_roller",
|
||||
"handover_block",
|
||||
"handover_cup",
|
||||
"handover_diverse_bottles",
|
||||
"handover_mic",
|
||||
"hanging_mug",
|
||||
"insert_pin",
|
||||
"lift_pot",
|
||||
"make_tea",
|
||||
"open_laptop",
|
||||
"open_microwave",
|
||||
"pick_diverse_bottles",
|
||||
"pick_dual_bottles",
|
||||
"place_basket",
|
||||
"place_block",
|
||||
"place_cable",
|
||||
"place_can",
|
||||
"place_chopsticks",
|
||||
"place_cloth",
|
||||
"place_container",
|
||||
"place_cup",
|
||||
"place_diverse_bottles",
|
||||
"place_dual_bottles",
|
||||
"place_fork",
|
||||
"place_knife",
|
||||
"place_object_basket",
|
||||
"place_ring",
|
||||
"place_ruler",
|
||||
"place_shoes_left",
|
||||
"place_shoes_right",
|
||||
"place_spoon",
|
||||
"place_toy",
|
||||
"pour_water",
|
||||
"press_stapler",
|
||||
"put_bottles_dustbin",
|
||||
"put_object_cabinet",
|
||||
"put_shoes_box",
|
||||
"rotate_qrcode",
|
||||
"scan_object",
|
||||
"shake_bottle",
|
||||
"shake_bottle_horizontally",
|
||||
"stack_blocks_three",
|
||||
"stack_blocks_two",
|
||||
"stack_bowls_three",
|
||||
"stack_bowls_two",
|
||||
"stamp_seal",
|
||||
"turn_switch",
|
||||
"wipe_board",
|
||||
"arrange_tools",
|
||||
"build_tower",
|
||||
"fold_cloth",
|
||||
)
|
||||
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
|
||||
|
||||
|
||||
def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
|
||||
"""Build the kwargs dict RoboTwin's setup_demo expects.
|
||||
|
||||
Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
|
||||
reads ``task_config/demo_clean.yml``, resolves the embodiment file from
|
||||
``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
|
||||
reads camera dimensions from ``_camera_config.yml``.
|
||||
|
||||
Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
|
||||
used by beat_block_hammer and most smoke-test tasks).
|
||||
"""
|
||||
if task_name in _ROBOTWIN_SETUP_CACHE:
|
||||
return dict(_ROBOTWIN_SETUP_CACHE[task_name])
|
||||
|
||||
import os
|
||||
|
||||
import yaml # type: ignore[import-untyped]
|
||||
from envs import CONFIGS_PATH # type: ignore[import-not-found]
|
||||
|
||||
task_config = "demo_clean"
|
||||
with open(f"./task_config/{task_config}.yml", encoding="utf-8") as f:
|
||||
args = yaml.safe_load(f)
|
||||
|
||||
# Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
|
||||
with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f:
|
||||
embodiment_types = yaml.safe_load(f)
|
||||
embodiment = args.get("embodiment", ["aloha-agilex"])
|
||||
if len(embodiment) == 1:
|
||||
robot_file = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["left_robot_file"] = robot_file
|
||||
args["right_robot_file"] = robot_file
|
||||
args["dual_arm_embodied"] = True
|
||||
elif len(embodiment) == 3:
|
||||
args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
|
||||
args["embodiment_dis"] = embodiment[2]
|
||||
args["dual_arm_embodied"] = False
|
||||
else:
|
||||
raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
|
||||
|
||||
with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["left_embodiment_config"] = yaml.safe_load(f)
|
||||
with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["right_embodiment_config"] = yaml.safe_load(f)
|
||||
|
||||
# Camera dimensions
|
||||
with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f:
|
||||
camera_config = yaml.safe_load(f)
|
||||
head_cam = args["camera"]["head_camera_type"]
|
||||
args["head_camera_h"] = camera_config[head_cam]["h"]
|
||||
args["head_camera_w"] = camera_config[head_cam]["w"]
|
||||
|
||||
# Headless overrides
|
||||
args["render_freq"] = 0
|
||||
args["task_name"] = task_name
|
||||
args["task_config"] = task_config
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE[task_name] = args
|
||||
return dict(args)
|
||||
|
||||
|
||||
def _load_robotwin_task(task_name: str) -> type:
|
||||
"""Dynamically import and return a RoboTwin 2.0 task class.
|
||||
|
||||
RoboTwin tasks live in ``envs/<task_name>.py`` relative to the repository
|
||||
root and are expected to be on ``sys.path`` after installation.
|
||||
"""
|
||||
try:
|
||||
module = importlib.import_module(f"envs.{task_name}")
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(
|
||||
f"Could not import RoboTwin task '{task_name}'. "
|
||||
"Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
|
||||
"See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
|
||||
) from e
|
||||
task_cls = getattr(module, task_name, None)
|
||||
if task_cls is None:
|
||||
raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
|
||||
return task_cls
|
||||
|
||||
|
||||
class RoboTwinEnv(gym.Env):
|
||||
"""Gymnasium wrapper around a single RoboTwin 2.0 task.
|
||||
|
||||
RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
|
||||
``take_action`` / ``check_success``) rather than the standard gym interface.
|
||||
This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
|
||||
RoboTwin exactly like LIBERO or Meta-World.
|
||||
|
||||
The underlying SAPIEN environment is created lazily on the first ``reset()``
|
||||
call *inside the worker process*. This is required for
|
||||
``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
|
||||
contexts that must not be forked from the parent process.
|
||||
|
||||
Observations
|
||||
------------
|
||||
The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
|
||||
``"head_camera"``, ``"left_wrist"``). ``preprocess_observation`` in
|
||||
``envs/utils.py`` then converts these to ``observation.images.<cam>``.
|
||||
|
||||
Actions
|
||||
-------
|
||||
14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task_name: str,
|
||||
episode_index: int = 0,
|
||||
n_envs: int = 1,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int = 480,
|
||||
observation_width: int = 640,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
render_mode: str = "rgb_array",
|
||||
):
|
||||
super().__init__()
|
||||
self.task_name = task_name
|
||||
self.task = task_name # used by add_envs_task() in utils.py
|
||||
self.task_description = task_name.replace("_", " ")
|
||||
self.episode_index = episode_index
|
||||
self._reset_stride = n_envs
|
||||
self.camera_names = list(camera_names)
|
||||
self.observation_height = observation_height
|
||||
self.observation_width = observation_width
|
||||
self.episode_length = episode_length
|
||||
self.render_mode = render_mode
|
||||
|
||||
self._env: Any | None = None # deferred — created on first reset() inside worker
|
||||
self._step_count: int = 0
|
||||
|
||||
image_spaces = {
|
||||
cam: spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(observation_height, observation_width, 3),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
for cam in self.camera_names
|
||||
}
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(image_spaces),
|
||||
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float64),
|
||||
}
|
||||
)
|
||||
self.action_space = spaces.Box(
|
||||
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
|
||||
)
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the SAPIEN environment on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own EGL/GPU context rather than inheriting a stale one from the
|
||||
parent process (which causes crashes with AsyncVectorEnv).
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
task_cls = _load_robotwin_task(self.task_name)
|
||||
self._env = task_cls()
|
||||
|
||||
def _get_obs(self) -> RobotObservation:
|
||||
assert self._env is not None, "_get_obs called before _ensure_env()"
|
||||
raw = self._env.get_obs()
|
||||
|
||||
images: dict[str, np.ndarray] = {}
|
||||
for cam in self.camera_names:
|
||||
# RoboTwin 2.0 camera keys follow the "<name>_rgb" convention.
|
||||
# Fall back to the bare name if the suffixed key is absent.
|
||||
key = f"{cam}_rgb" if f"{cam}_rgb" in raw else cam
|
||||
if key in raw:
|
||||
img = np.asarray(raw[key], dtype=np.uint8)
|
||||
# Ensure HWC and exactly 3 channels
|
||||
if img.ndim == 2:
|
||||
img = np.stack([img, img, img], axis=-1)
|
||||
elif img.shape[-1] != 3:
|
||||
img = img[..., :3]
|
||||
if img.shape[0] != self.observation_height or img.shape[1] != self.observation_width:
|
||||
# Resize is best done outside hot-loop; return as-is and let
|
||||
# the processor handle it if needed.
|
||||
pass
|
||||
images[cam] = img
|
||||
else:
|
||||
# Camera not exposed by this task — return a black frame so the
|
||||
# observation shape stays consistent across all tasks.
|
||||
images[cam] = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
|
||||
|
||||
# Joint state: try common key names used by RoboTwin base_task.
|
||||
joint_state: np.ndarray | None = None
|
||||
for key in ("joint_action", "qpos", "robot_qpos", "joint_state", "obs"):
|
||||
if key in raw:
|
||||
arr = np.asarray(raw[key], dtype=np.float64).flatten()
|
||||
if arr.size >= ACTION_DIM:
|
||||
joint_state = arr[:ACTION_DIM]
|
||||
break
|
||||
if joint_state is None:
|
||||
joint_state = np.zeros(ACTION_DIM, dtype=np.float64)
|
||||
|
||||
return {"pixels": images, "agent_pos": joint_state}
|
||||
|
||||
def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
|
||||
self._ensure_env()
|
||||
super().reset(seed=seed)
|
||||
assert self._env is not None # set by _ensure_env() above
|
||||
|
||||
actual_seed = self.episode_index if seed is None else seed
|
||||
setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
|
||||
setup_kwargs.update(seed=actual_seed, is_test=True)
|
||||
self._env.setup_demo(**setup_kwargs)
|
||||
self.episode_index += self._reset_stride
|
||||
self._step_count = 0
|
||||
|
||||
obs = self._get_obs()
|
||||
return obs, {"is_success": False, "task": self.task_name}
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
assert self._env is not None, "step() called before reset()"
|
||||
if action.ndim != 1 or action.shape[0] != ACTION_DIM:
|
||||
raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
|
||||
|
||||
# RoboTwin 2.0 uses take_action(); fall back to step() for older forks.
|
||||
if hasattr(self._env, "take_action"):
|
||||
self._env.take_action(action)
|
||||
else:
|
||||
self._env.step(action)
|
||||
|
||||
self._step_count += 1
|
||||
|
||||
is_success = bool(getattr(self._env, "eval_success", False))
|
||||
if not is_success and hasattr(self._env, "check_success"):
|
||||
is_success = bool(self._env.check_success())
|
||||
|
||||
obs = self._get_obs()
|
||||
reward = float(is_success)
|
||||
terminated = is_success
|
||||
truncated = self._step_count >= self.episode_length
|
||||
|
||||
info: dict[str, Any] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
"step": self._step_count,
|
||||
}
|
||||
if terminated or truncated:
|
||||
info["final_info"] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
}
|
||||
self.reset()
|
||||
|
||||
return obs, reward, terminated, truncated, info
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
obs = self._get_obs()
|
||||
# Prefer head camera for rendering; fall back to first available.
|
||||
if "head_camera" in obs["pixels"]:
|
||||
return obs["pixels"]["head_camera"]
|
||||
return next(iter(obs["pixels"].values()))
|
||||
|
||||
def close(self) -> None:
|
||||
if self._env is not None:
|
||||
if hasattr(self._env, "close_env"):
|
||||
import contextlib
|
||||
|
||||
with contextlib.suppress(TypeError):
|
||||
self._env.close_env()
|
||||
self._env = None
|
||||
|
||||
|
||||
# ---- Multi-task factory --------------------------------------------------------
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task_name: str,
|
||||
n_envs: int,
|
||||
camera_names: list[str],
|
||||
observation_height: int,
|
||||
observation_width: int,
|
||||
episode_length: int,
|
||||
) -> list[Callable[[], RoboTwinEnv]]:
|
||||
"""Return n_envs factory callables for a single task."""
|
||||
|
||||
def _make_one(episode_index: int) -> RoboTwinEnv:
|
||||
return RoboTwinEnv(
|
||||
task_name=task_name,
|
||||
episode_index=episode_index,
|
||||
n_envs=n_envs,
|
||||
camera_names=camera_names,
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robotwin_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int = 480,
|
||||
observation_width: int = 640,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create vectorized RoboTwin 2.0 environments.
|
||||
|
||||
Returns:
|
||||
``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping
|
||||
``n_envs`` parallel rollouts.
|
||||
|
||||
Args:
|
||||
task: Comma-separated list of task names (e.g. ``"beat_block_hammer"``
|
||||
or ``"beat_block_hammer,click_bell"``).
|
||||
n_envs: Number of parallel rollouts per task.
|
||||
env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``).
|
||||
camera_names: Cameras to include in observations.
|
||||
observation_height: Pixel height for all cameras.
|
||||
observation_width: Pixel width for all cameras.
|
||||
episode_length: Max steps before truncation.
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
task_names = [t.strip() for t in str(task).split(",") if t.strip()]
|
||||
if not task_names:
|
||||
raise ValueError("`task` must contain at least one RoboTwin task name.")
|
||||
|
||||
unknown = [t for t in task_names if t not in ROBOTWIN_TASKS]
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
|
||||
|
||||
print(f"Creating RoboTwin envs | tasks={task_names} | n_envs(per task)={n_envs}")
|
||||
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
for task_name in task_names:
|
||||
fns = _make_env_fns(
|
||||
task_name=task_name,
|
||||
n_envs=n_envs,
|
||||
camera_names=list(camera_names),
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
out[task_name][0] = env_cls(fns)
|
||||
print(f"Built vec env | task={task_name} | n_envs={n_envs}")
|
||||
|
||||
return {k: dict(v) for k, v in out.items()}
|
||||
@@ -226,3 +226,29 @@ class IsaaclabArenaProcessorStep(ObservationProcessorStep):
|
||||
|
||||
def observation(self, observation):
|
||||
return self._process_observation(observation)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ProcessorStepRegistry.register(name="robotwin_processor")
|
||||
class RoboTwinProcessorStep(ObservationProcessorStep):
|
||||
"""Passthrough step for RoboTwin observations, casting state to float32.
|
||||
|
||||
RoboTwin observations already arrive in LeRobot format (observation.images.*
|
||||
and observation.state), so this step mainly ensures state dtype is float32.
|
||||
"""
|
||||
|
||||
def _process_observation(self, observation):
|
||||
processed_obs = dict(observation)
|
||||
if OBS_STATE in processed_obs:
|
||||
state = processed_obs[OBS_STATE]
|
||||
if hasattr(state, "dtype") and state.dtype != torch.float32:
|
||||
processed_obs[OBS_STATE] = state.float()
|
||||
return processed_obs
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
return features
|
||||
|
||||
def observation(self, observation):
|
||||
return self._process_observation(observation)
|
||||
|
||||
@@ -0,0 +1,283 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper.
|
||||
|
||||
These tests mock out the SAPIEN-based RoboTwin runtime so they run without
|
||||
the full RoboTwin installation (SAPIEN, CuRobo, mplib, etc.).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from lerobot.envs.robotwin import (
|
||||
ACTION_DIM,
|
||||
ROBOTWIN_CAMERA_NAMES,
|
||||
ROBOTWIN_TASKS,
|
||||
RoboTwinEnv,
|
||||
create_robotwin_envs,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures / helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_mock_task_env(
|
||||
height: int = 480,
|
||||
width: int = 640,
|
||||
cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES,
|
||||
) -> MagicMock:
|
||||
"""Return a mock that mimics the RoboTwin task class API."""
|
||||
obs_dict = {f"{cam}_rgb": np.zeros((height, width, 3), dtype=np.uint8) for cam in cameras}
|
||||
obs_dict["joint_action"] = np.zeros(ACTION_DIM, dtype=np.float64)
|
||||
|
||||
mock = MagicMock()
|
||||
mock.get_obs.return_value = obs_dict
|
||||
mock.setup_demo.return_value = None
|
||||
mock.take_action.return_value = None
|
||||
mock.eval_success = False
|
||||
mock.check_success.return_value = False
|
||||
mock.close_env.return_value = None
|
||||
return mock
|
||||
|
||||
|
||||
def _patch_load(mock_task_instance: MagicMock):
|
||||
"""Context manager that patches _load_robotwin_task to return a mock class."""
|
||||
task_cls = MagicMock(return_value=mock_task_instance)
|
||||
return patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_cls)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# RoboTwinEnv unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRoboTwinEnv:
|
||||
def test_observation_space_shape(self):
|
||||
"""observation_space should have the configured h×w×3 for every camera."""
|
||||
h, w = 240, 320
|
||||
env = RoboTwinEnv(
|
||||
task_name="beat_block_hammer",
|
||||
observation_height=h,
|
||||
observation_width=w,
|
||||
camera_names=["head_camera", "front_camera"],
|
||||
)
|
||||
pixels_space = env.observation_space["pixels"]
|
||||
assert pixels_space["head_camera"].shape == (h, w, 3)
|
||||
assert pixels_space["front_camera"].shape == (h, w, 3)
|
||||
assert "left_wrist" not in pixels_space
|
||||
|
||||
def test_action_space(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env.action_space.shape == (ACTION_DIM,)
|
||||
assert env.action_space.dtype == np.float32
|
||||
|
||||
def test_reset_returns_correct_obs_keys(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_load(mock_task):
|
||||
obs, info = env.reset()
|
||||
|
||||
assert "pixels" in obs
|
||||
for cam in ROBOTWIN_CAMERA_NAMES:
|
||||
assert cam in obs["pixels"], f"Missing camera '{cam}' in obs"
|
||||
assert "agent_pos" in obs
|
||||
assert obs["agent_pos"].shape == (ACTION_DIM,)
|
||||
assert info["is_success"] is False
|
||||
|
||||
def test_reset_calls_setup_demo(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_load(mock_task):
|
||||
env.reset(seed=42)
|
||||
mock_task.setup_demo.assert_called_once_with(seed=42, is_test=True)
|
||||
|
||||
def test_step_returns_correct_types(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_load(mock_task):
|
||||
env.reset()
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
|
||||
assert isinstance(obs, dict)
|
||||
assert isinstance(reward, float)
|
||||
assert isinstance(terminated, bool)
|
||||
assert isinstance(truncated, bool)
|
||||
assert isinstance(info, dict)
|
||||
|
||||
def test_step_wrong_action_shape_raises(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
bad_action = np.zeros(7, dtype=np.float32) # wrong dim
|
||||
with _patch_load(mock_task):
|
||||
env.reset()
|
||||
with pytest.raises(ValueError, match="Expected 1-D action"):
|
||||
env.step(bad_action)
|
||||
|
||||
def test_success_terminates_episode(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
mock_task.check_success.return_value = True # always succeeds
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_load(mock_task):
|
||||
env.reset()
|
||||
_, _, terminated, _, info = env.step(action)
|
||||
assert terminated is True
|
||||
assert info["is_success"] is True
|
||||
|
||||
def test_truncation_after_episode_length(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2)
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_load(mock_task):
|
||||
env.reset()
|
||||
env.step(action) # step 1
|
||||
_, _, _, truncated, _ = env.step(action) # step 2 → truncated
|
||||
assert truncated is True
|
||||
|
||||
def test_close_calls_close_env(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_load(mock_task):
|
||||
env.reset()
|
||||
env.close()
|
||||
mock_task.close_env.assert_called_once_with(clearance=False)
|
||||
|
||||
def test_black_frame_for_missing_camera(self):
|
||||
"""If a camera key is absent from get_obs(), a black frame is returned."""
|
||||
mock_task = _make_mock_task_env(cameras=("head_camera",)) # only head_camera
|
||||
env = RoboTwinEnv(
|
||||
task_name="beat_block_hammer",
|
||||
camera_names=["head_camera", "front_camera"],
|
||||
observation_height=10,
|
||||
observation_width=10,
|
||||
)
|
||||
with _patch_load(mock_task):
|
||||
obs, _ = env.reset()
|
||||
# front_camera is missing from mock → should be a black 10×10×3 array
|
||||
assert obs["pixels"]["front_camera"].shape == (10, 10, 3)
|
||||
assert obs["pixels"]["front_camera"].sum() == 0
|
||||
|
||||
def test_task_and_task_description_attributes(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env.task == "beat_block_hammer"
|
||||
assert isinstance(env.task_description, str)
|
||||
|
||||
def test_deferred_init_env_is_none_before_reset(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env._env is None # noqa: SLF001 (testing internal state)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# create_robotwin_envs tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCreateRoboTwinEnvs:
|
||||
def test_returns_correct_structure(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env_cls = gym.vector.SyncVectorEnv
|
||||
with _patch_load(mock_task):
|
||||
envs = create_robotwin_envs(
|
||||
task="beat_block_hammer",
|
||||
n_envs=1,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
assert "beat_block_hammer" in envs
|
||||
assert 0 in envs["beat_block_hammer"]
|
||||
assert isinstance(envs["beat_block_hammer"][0], gym.vector.SyncVectorEnv)
|
||||
|
||||
def test_multi_task(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env_cls = gym.vector.SyncVectorEnv
|
||||
with _patch_load(mock_task):
|
||||
envs = create_robotwin_envs(
|
||||
task="beat_block_hammer,click_bell",
|
||||
n_envs=1,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
assert set(envs.keys()) == {"beat_block_hammer", "click_bell"}
|
||||
|
||||
def test_unknown_task_raises(self):
|
||||
with pytest.raises(ValueError, match="Unknown RoboTwin tasks"):
|
||||
create_robotwin_envs(
|
||||
task="not_a_real_task",
|
||||
n_envs=1,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
|
||||
def test_invalid_n_envs_raises(self):
|
||||
with pytest.raises(ValueError, match="n_envs must be a positive int"):
|
||||
create_robotwin_envs(
|
||||
task="beat_block_hammer",
|
||||
n_envs=0,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ROBOTWIN_TASKS list
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_task_list_not_empty():
|
||||
assert len(ROBOTWIN_TASKS) >= 60
|
||||
|
||||
|
||||
def test_all_tasks_are_strings():
|
||||
assert all(isinstance(t, str) and t for t in ROBOTWIN_TASKS)
|
||||
|
||||
|
||||
def test_no_duplicate_tasks():
|
||||
assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# RoboTwinProcessorStep
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRoboTwinProcessorStep:
|
||||
def test_passes_through_images_and_state(self):
|
||||
from lerobot.processor.env_processor import RoboTwinProcessorStep
|
||||
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
|
||||
|
||||
step = RoboTwinProcessorStep()
|
||||
obs = {
|
||||
f"{OBS_IMAGES}.head_camera": torch.zeros(1, 3, 4, 4),
|
||||
f"{OBS_IMAGES}.front_camera": torch.zeros(1, 3, 4, 4),
|
||||
OBS_STATE: torch.zeros(1, 14),
|
||||
}
|
||||
result = step.observation(obs)
|
||||
assert f"{OBS_IMAGES}.head_camera" in result
|
||||
assert f"{OBS_IMAGES}.front_camera" in result
|
||||
assert result[OBS_STATE].dtype == torch.float32
|
||||
|
||||
def test_state_cast_to_float32(self):
|
||||
from lerobot.processor.env_processor import RoboTwinProcessorStep
|
||||
from lerobot.utils.constants import OBS_STATE
|
||||
|
||||
step = RoboTwinProcessorStep()
|
||||
obs = {OBS_STATE: torch.zeros(1, 14, dtype=torch.float64)}
|
||||
result = step.observation(obs)
|
||||
assert result[OBS_STATE].dtype == torch.float32
|
||||
Reference in New Issue
Block a user