diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 79d5614b2..b07c8f8da 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -83,10 +83,13 @@ jobs: cache-binary: false - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] with: username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} # Build the benchmark-specific image. The Dockerfile separates dep-install # from source-copy, so code-only changes skip the slow uv-sync layer @@ -115,7 +118,7 @@ jobs: bash -c " hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true lerobot-eval \ - --policy.path=pepijn223/smolvla_libero \ + --policy.path=lerobot/smolvla_libero \ --env.type=libero \ --env.task=libero_spatial \ --eval.batch_size=1 \ @@ -144,7 +147,7 @@ jobs: --artifacts-dir /tmp/libero-artifacts \ --env libero \ --task libero_spatial \ - --policy pepijn223/smolvla_libero + --policy lerobot/smolvla_libero - name: Upload Libero rollout video if: always() @@ -238,10 +241,13 @@ jobs: cache-binary: false - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] with: username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} - name: Build MetaWorld benchmark image uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] @@ -264,7 +270,7 @@ jobs: bash -c " hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true lerobot-eval \ - --policy.path=pepijn223/smolvla_metaworld \ + --policy.path=lerobot/smolvla_metaworld \ --env.type=metaworld \ --env.task=metaworld-push-v3 \ --eval.batch_size=1 \ 
@@ -293,7 +299,7 @@ jobs: --artifacts-dir /tmp/metaworld-artifacts \ --env metaworld \ --task metaworld-push-v3 \ - --policy pepijn223/smolvla_metaworld + --policy lerobot/smolvla_metaworld - name: Upload MetaWorld rollout video if: always() @@ -310,3 +316,630 @@ jobs: name: metaworld-metrics path: /tmp/metaworld-artifacts/metrics.json if-no-files-found: warn + + # ── ROBOTWIN 2.0 ────────────────────────────────────────────────────────── + # Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo, + # pytorch3d, + simulation assets (~4 GB). + # Build takes ~20 min on first run; subsequent runs hit the layer cache. + # Requires an NVIDIA GPU runner with CUDA 12.1 drivers. + robotwin-integration-test: + name: RoboTwin 2.0 — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + ROBOTWIN_POLICY: lerobot/smolvla_robotwin + ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + # Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d + + # simulation assets (~4 GB). Layer cache lives in the runner's local + # Docker daemon — reused across re-runs on the same machine. + - name: Build RoboTwin 2.0 benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robotwin + push: false + load: true + tags: lerobot-benchmark-robotwin:ci + cache-from: type=local,src=/tmp/.buildx-cache-robotwin + cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max + + - name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + # Named container (no --rm) so we can docker cp artifacts out. + docker run --name robotwin-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \ + -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \ + lerobot-benchmark-robotwin:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + cd /opt/robotwin && lerobot-eval \ + --policy.path=\"\$ROBOTWIN_POLICY\" \ + --env.type=robotwin \ + --env.task=\"\$ROBOTWIN_TASKS\" \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python /lerobot/scripts/ci/extract_task_descriptions.py \ + --env robotwin \ + --task \"\$ROBOTWIN_TASKS\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboTwin artifacts from container + if: always() + run: | + mkdir -p /tmp/robotwin-artifacts + docker cp robotwin-eval:/tmp/eval-artifacts/. 
/tmp/robotwin-artifacts/ 2>/dev/null || true + docker rm -f robotwin-eval || true + + - name: Parse RoboTwin eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robotwin-artifacts \ + --env robotwin \ + --task "${ROBOTWIN_TASKS}" \ + --policy "${ROBOTWIN_POLICY}" + + - name: Upload RoboTwin rollout video + if: always() + uses: actions/upload-artifact@v4 + with: + name: robotwin-rollout-video + path: /tmp/robotwin-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboTwin eval metrics + if: always() + uses: actions/upload-artifact@v4 + with: + name: robotwin-metrics + path: /tmp/robotwin-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOCASA365 ────────────────────────────────────────────────────────── + # Isolated image: robocasa + robosuite installed manually as editable + # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins + # `lerobot==0.3.3`, which would shadow this repo's lerobot). + robocasa-integration-test: + name: RoboCasa365 — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCasa365 benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robocasa + push: false + load: true + tags: lerobot-benchmark-robocasa:ci + + - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocasa-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e MUJOCO_GL=egl \ + lerobot-benchmark-robocasa:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robocasa \ + --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCasa365 artifacts from container + if: always() + run: | + mkdir -p /tmp/robocasa-artifacts + docker cp robocasa-eval:/tmp/eval-artifacts/. 
/tmp/robocasa-artifacts/ 2>/dev/null || true + docker rm -f robocasa-eval || true + + - name: Parse RoboCasa365 eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocasa-artifacts \ + --env robocasa \ + --task atomic_smoke_10 \ + --policy lerobot/smolvla_robocasa + + - name: Upload RoboCasa365 rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-rollout-video + path: /tmp/robocasa-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCasa365 eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-metrics + path: /tmp/robocasa-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOCEREBRA ─────────────────────────────────────────────────────────── + # Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera + # defaults (image/wrist_image). The image is layered on + # huggingface/lerobot-gpu, which already ships [libero] as part of [all]. + robocerebra-integration-test: + name: RoboCerebra — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCerebra benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robocerebra + push: false + load: true + tags: lerobot-benchmark-robocerebra:ci + cache-from: type=local,src=/tmp/.buildx-cache-robocerebra + cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max + + - name: Run RoboCerebra smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocerebra-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e LIBERO_DATA_FOLDER=/tmp/libero_data \ + lerobot-benchmark-robocerebra:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero --task libero_10 \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCerebra artifacts from container + if: always() + run: | + mkdir -p /tmp/robocerebra-artifacts + docker cp robocerebra-eval:/tmp/eval-artifacts/. 
/tmp/robocerebra-artifacts/ 2>/dev/null || true + docker rm -f robocerebra-eval || true + + - name: Parse RoboCerebra eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocerebra-artifacts \ + --env robocerebra \ + --task libero_10 \ + --policy lerobot/smolvla_robocerebra + + - name: Upload RoboCerebra rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-rollout-video + path: /tmp/robocerebra-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCerebra eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-metrics + path: /tmp/robocerebra-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOMME ─────────────────────────────────────────────────────────────── + # Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy + # overrides (robomme can't be a pyproject extra due to numpy<2 pin). 
+ robomme-integration-test: + name: RoboMME — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + ROBOMME_POLICY: lerobot/smolvla_robomme + ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboMME benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robomme + push: false + load: true + tags: lerobot-benchmark-robomme:ci + + - name: Run RoboMME smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robomme-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e ROBOMME_POLICY="${ROBOMME_POLICY}" \ + -e ROBOMME_TASKS="${ROBOMME_TASKS}" \ + lerobot-benchmark-robomme:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=\"\$ROBOMME_POLICY\" \ + --env.type=robomme \ + --env.task=\"\$ROBOMME_TASKS\" \ + --env.dataset_split=test \ + --env.task_ids=[0] \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=3 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robomme --task \"\$ROBOMME_TASKS\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboMME artifacts from container + if: always() + run: | + mkdir -p /tmp/robomme-artifacts + docker cp robomme-eval:/tmp/eval-artifacts/. 
/tmp/robomme-artifacts/ 2>/dev/null || true + docker rm -f robomme-eval || true + + - name: Parse RoboMME eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robomme-artifacts \ + --env robomme \ + --task "${ROBOMME_TASKS}" \ + --policy "${ROBOMME_POLICY}" + + - name: Upload RoboMME rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-rollout-video + path: /tmp/robomme-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboMME eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-metrics + path: /tmp/robomme-artifacts/metrics.json + if-no-files-found: warn + + # ── LIBERO-plus ─────────────────────────────────────────────────────────── + # Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of + # huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus). + libero-plus-integration-test: + name: LIBERO-plus — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + LIBERO_PLUS_SUITE: libero_spatial + LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus + LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]" + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build LIBERO-plus benchmark image + uses: docker/build-push-action@v6 # 
zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.libero_plus + push: false + load: true + tags: lerobot-benchmark-libero-plus:ci + cache-from: type=local,src=/tmp/.buildx-cache-libero-plus + cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max + + - name: Run LIBERO-plus smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name libero-plus-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \ + -e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \ + -e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \ + lerobot-benchmark-libero-plus:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=\"\$LIBERO_PLUS_POLICY\" \ + --env.type=libero_plus \ + --env.task=\"\$LIBERO_PLUS_SUITE\" \ + --env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy LIBERO-plus artifacts from container + if: always() + run: | + mkdir -p /tmp/libero-plus-artifacts + docker cp libero-plus-eval:/tmp/eval-artifacts/. 
/tmp/libero-plus-artifacts/ 2>/dev/null || true + docker rm -f libero-plus-eval || true + + - name: Parse LIBERO-plus eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/libero-plus-artifacts \ + --env libero_plus \ + --task "${LIBERO_PLUS_SUITE}" \ + --policy "${LIBERO_PLUS_POLICY}" + + - name: Upload LIBERO-plus rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-rollout-video + path: /tmp/libero-plus-artifacts/videos/ + if-no-files-found: warn + + - name: Upload LIBERO-plus eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-metrics + path: /tmp/libero-plus-artifacts/metrics.json + if-no-files-found: warn + + # ── VLABENCH ───────────────────────────────────────────────────────────── + # Isolated image: lerobot[vlabench] only (VLABench, mujoco==3.2.2, dm-control chain) + vlabench-integration-test: + name: VLABench — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build VLABench benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.vlabench + push: false + load: true + tags: lerobot-benchmark-vlabench:ci + build-args: | + VLABENCH_ASSETS_REPO=lerobot/vlabench-assets + + - name: Run VLABench smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name vlabench-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e MUJOCO_GL=egl \ + lerobot-benchmark-vlabench:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env vlabench \ + --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy VLABench artifacts from container + if: always() + run: | + mkdir -p /tmp/vlabench-artifacts + docker cp vlabench-eval:/tmp/eval-artifacts/. 
/tmp/vlabench-artifacts/ 2>/dev/null || true + docker rm -f vlabench-eval || true + + - name: Parse VLABench eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/vlabench-artifacts \ + --env vlabench \ + --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --policy lerobot/smolvla_vlabench + + - name: Upload VLABench rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: vlabench-rollout-video + path: /tmp/vlabench-artifacts/videos/ + if-no-files-found: warn + + - name: Upload VLABench eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: vlabench-metrics + path: /tmp/vlabench-artifacts/metrics.json + if-no-files-found: warn diff --git a/AGENTS.md b/AGENTS.md index c1aba7471..bd1bf0af1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,5 +1,7 @@ This file provides guidance to AI agents when working with code in this repository. +> **User-facing help → [`AGENT_GUIDE.md`](./AGENT_GUIDE.md)** (SO-101 setup, recording, picking a policy, training duration, eval — with copy-pasteable commands). + ## Project Overview LeRobot is a PyTorch-based library for real-world robotics, providing datasets, pretrained policies, and tools for training, evaluation, data collection, and robot control. It integrates with Hugging Face Hub for model/dataset sharing. diff --git a/AGENT_GUIDE.md b/AGENT_GUIDE.md new file mode 100644 index 000000000..725948dc9 --- /dev/null +++ b/AGENT_GUIDE.md @@ -0,0 +1,410 @@ +# AGENT_GUIDE.md — LeRobot Helper for AI Agents & Users + +This file is a practical, copy-paste-friendly companion for any AI agent (Cursor, Claude, ChatGPT, Codex, etc.) helping a user work with LeRobot. 
It complements [`AGENTS.md`](./AGENTS.md) (dev/contributor context) with **user-facing guidance**: how to start, what to train, how long, how to record, and how to calibrate an SO-101. + +--- + +## 1. Start here — ask the user first (MANDATORY) + +Before suggesting any command, an agent MUST ask the user at least these questions and wait for answers: + +1. **What's your goal?** (e.g. "teach my SO-101 to fold a cloth", "train a policy on an existing HF dataset", "contribute a PR", "understand the codebase") +2. **What hardware do you have?** + - Robot: none / SO-100 / SO-101 / Koch / LeKiwi / Reachy / other + - Teleop: leader arm / phone / keyboard / gamepad / none + - Cameras: how many, resolution, fixed or moving? +3. **What machine will you train on?** + - GPU model + VRAM (e.g. "laptop 3060 6 GB", "RTX 4090 24 GB", "A100 80 GB", "CPU only") + - OS: macOS / Linux / Windows +4. **Skill level & time budget?** First time, some ML, experienced? Hours, days, a weekend? +5. **Do you already have a dataset?** Yes (HF repo id?) / no / want to record one +6. **How can I help right now?** (pick one concrete next step) + +Only after you have answers, propose a concrete path. If something is ambiguous, ask again rather than guessing. Bias toward **the simplest thing that works** for the user's hardware and goal. + +--- + +## 2. LeRobot in 60 seconds + +LeRobot = **datasets + policies + envs + robot control**, unified by a small set of strong abstractions. + +- **`LeRobotDataset`** — episode-aware dataset (video or images + actions + state), loadable from the Hub or disk. +- **Policies** (`ACT`, `Diffusion`, `SmolVLA`, `π0`, `π0.5`, `Wall-X`, `X-VLA`, `VQ-BeT`, `TD-MPC`, …) — all inherit `PreTrainedPolicy` and can be pushed/pulled from the Hub. +- **Processors** — small composable transforms between dataset → policy → robot. +- **Envs** (sim) and **Robots** (real) — same action/observation contract so code swaps cleanly. 
+- **CLI** — `lerobot-record`, `lerobot-train`, `lerobot-eval`, `lerobot-teleoperate`, `lerobot-calibrate`, `lerobot-find-port`, `lerobot-setup-motors`, `lerobot-replay`. + +See [`AGENTS.md`](./AGENTS.md) for repo architecture. + +--- + +## 3. Quickstart paths (pick one) + +### Path A — "I have an SO-101 and want my first trained policy" + +Go to §4 (SO-101 end-to-end), then §5 (data tips), then §6 (pick a policy — likely **ACT**), then §7 (how long), then §8 (eval). + +### Path B — "No hardware, I want to train on an existing dataset" + +Skip §4. Pick a policy in §6, pick a duration in §7, then run `lerobot-train` per §4.9 with a Hub `--dataset.repo_id` and an `--env.type` for eval. Finish with §8. + +### Path C — "I just want to understand the codebase" + +Read §2 above, then `AGENTS.md` "Architecture", then open `src/lerobot/policies/act/` and `src/lerobot/datasets/lerobot_dataset.py` as canonical examples. + +--- + +## 4. SO-101 end-to-end cheat-sheet + +Full details in [`docs/source/so101.mdx`](./docs/source/so101.mdx) and [`docs/source/il_robots.mdx`](./docs/source/il_robots.mdx). Minimum commands in order. Confirm arms are assembled + powered before issuing. + +**4.1 Install** + +```bash +pip install 'lerobot[feetech]' # SO-100/SO-101 motor stack +# pip install 'lerobot[all]' # everything +# pip install 'lerobot[aloha,pusht]' # specific features +# pip install 'lerobot[smolvla]' # add SmolVLA deps +git lfs install && git lfs pull +hf auth login # required to push datasets/policies +``` + +Contributors can alternatively use `uv sync --locked --extra feetech` (see `AGENTS.md`). + +**4.2 Find USB ports** — run once per arm, unplug when prompted. + +```bash +lerobot-find-port +``` + +macOS: `/dev/tty.usbmodem...`; Linux: `/dev/ttyACM0` (may need `sudo chmod 666 /dev/ttyACM0`). 
+ +**4.3 Setup motor IDs & baudrate** (one-time, per arm) + +```bash +lerobot-setup-motors --robot.type=so101_follower --robot.port= +lerobot-setup-motors --teleop.type=so101_leader --teleop.port= +``` + +**4.4 Calibrate** — center all joints, press Enter, sweep each joint through its full range. The `id` is the calibration key — reuse it everywhere. + +```bash +lerobot-calibrate --robot.type=so101_follower --robot.port= --robot.id=my_follower +lerobot-calibrate --teleop.type=so101_leader --teleop.port= --teleop.id=my_leader +``` + +**4.5 Teleoperate** (sanity check, no recording) + +```bash +lerobot-teleoperate \ + --robot.type=so101_follower --robot.port= --robot.id=my_follower \ + --teleop.type=so101_leader --teleop.port= --teleop.id=my_leader \ + --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --display_data=true +``` + +> **Feetech timeout / comms error on SO-100 / SO-101?** Before touching software, check the **red motor LEDs** on the daisy chain. +> +> - **All steady red, gripper → base chain** → wiring OK. +> - **One or more motors dark / chain stops mid-way** → wiring issue: reseat the 3-pin cables, check the controller-board power supply, and make sure each motor is fully clicked in. +> - **LEDs blinking** → the motor is in an **error state**: usually overload (forcing a joint past its limit) **or wrong power supply voltage**. SO-100 / SO-101 ship in two variants — a **5 V / 7.4 V** build and a **12 V** build — they are NOT interchangeable. Using a 12 V PSU on a 5 V / 7.4 V arm (or vice-versa) will trip this error; confirm your motor variant before powering up. +> +> Most "timeout" errors are physical, not code. + +**4.6 Record a dataset** — keys: **→** next, **←** redo, **ESC** finish & upload. 
+ +```bash +HF_USER=$(NO_COLOR=1 hf auth whoami | awk -F': *' 'NR==1 {print $2}') + +lerobot-record \ + --robot.type=so101_follower --robot.port= --robot.id=my_follower \ + --teleop.type=so101_leader --teleop.port= --teleop.id=my_leader \ + --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --dataset.repo_id=${HF_USER}/my_task \ + --dataset.single_task="" \ + --dataset.num_episodes=50 \ + --dataset.episode_time_s=30 \ + --dataset.reset_time_s=10 \ + --display_data=true +``` + +**4.7 Visualize** — **always** do this before training. Look for missing frames, camera blur, unreachable targets, inconsistent object positions. +After upload: https://huggingface.co/spaces/lerobot/visualize_dataset → paste `${HF_USER}/my_task`. Works for **any LeRobot-formatted Hub dataset** — use it to scout other datasets, inspect episode quality, or debug your own data before retraining. + +**4.8 Replay an episode** (sanity check) + +```bash +lerobot-replay --robot.type=so101_follower --robot.port= --robot.id=my_follower \ + --dataset.repo_id=${HF_USER}/my_task --dataset.episode=0 +``` + +**4.9 Train** (default: ACT — fastest, lowest memory). Apple silicon: `--policy.device=mps`. See §6/§7 for policy and duration. + +```bash +lerobot-train \ + --dataset.repo_id=${HF_USER}/my_task \ + --policy.type=act \ + --policy.device=cuda \ + --output_dir=outputs/train/act_my_task \ + --job_name=act_my_task \ + --batch_size=8 \ + --wandb.enable=true \ + --policy.repo_id=${HF_USER}/act_my_task +``` + +**4.10 Evaluate on the real robot** — compare success rate to a teleoperated baseline. + +```bash +lerobot-record \ + --robot.type=so101_follower --robot.port= --robot.id=my_follower \ + --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --dataset.repo_id=${HF_USER}/eval_my_task \ + --dataset.single_task="" \ + --dataset.num_episodes=10 \ + --policy.path=${HF_USER}/act_my_task +``` + +--- + +## 5. 
Data collection tips (beginner → reliable policy) + +Good data beats clever models. Adopt these defaults and deviate only with evidence. + +### 5.1 Setup & ergonomics + +- **Fix the rig and cameras** before touching the software. If the rig vibrates or the operator gets frustrated, fix that first — more bad data won't help. +- **Lighting matters more than resolution.** Diffuse, consistent light. Avoid moving shadows. +- **"Can you do the task from the camera view alone?"** If no, your cameras are wrong. Fix before recording. +- Enable **action interpolation** for rollouts when available for smoother trajectories. + +### 5.2 Practice before you record + +- Do 5–10 demos without recording. Build a deliberate, repeatable strategy. +- Hesitant or inconsistent demos teach the model hesitation. + +### 5.3 Quality over speed + +Deliberate, high-quality execution beats fast sloppy runs. Optimize for speed only **after** strategy is dialed in — never trade quality for it. + +### 5.4 Consistency within and across episodes + +Same grasp, approach vector, and timing. Coherent strategies are much easier to learn than wildly varying movements. + +### 5.5 Start small, then extend (the golden rule) + +- **First 50 episodes = constrained version** of the task: one object, fixed position, fixed camera setup, one operator. +- Train a quick ACT model. See what fails. +- **Then add diversity** along one axis at a time: more positions → more lighting → more objects → more operators. +- Don't try to collect the "perfect dataset" on day one. Iterate. + +### 5.6 Policy choice for beginners + +- **Laptop / first time / want results fast → ACT.** Works surprisingly well, trains fast even on a laptop GPU. +- **Bigger GPU / language-conditioned / multi-task → SmolVLA.** Unfreezing the vision encoder (see §7) is a big win here. +- Defer π0 / π0.5 / Wall-X / X-VLA until you have a proven ACT baseline and a 20+ GB GPU. 
+ +### 5.7 Recommended defaults for your first task + +| Setting | Value | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| Episodes | **50** to start, scale to 100–300 after first training | +| Episode length | 20–45 s (shorter is fine for grasp/place) | +| Reset time | 10 s | +| FPS | 30 | +| Cameras | **2 cameras recommended**: 1 fixed front + 1 wrist. Multi-view often outperforms single-view. A single fixed camera also works to keep things simple. | +| Task description | Short, specific, action-phrased sentence | + +### 5.8 Troubleshooting signal + +- Policy fails at one specific stage → record 10–20 more episodes **targeting that stage**. +- Policy flaps / oscillates → likely inconsistent demos, or need more training; re-record worst episodes (use **←** to redo). +- Policy ignores the object → camera framing or lighting issue, not a model issue. + +See also: [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets#what-makes-a-good-dataset). + +--- + +## 6. Which policy should I train? + +Match the policy to the user's **GPU memory** and **time budget**. Numbers below come from an internal profiling run (one training update per policy). They are **indicative only** — see caveats. + +### 6.1 Profiling snapshot (indicative) + +All policies typically train for **5–10 epochs** (see §7). + +| Policy | Batch | Update (ms) | Peak GPU mem (GB) | Best for | +| ----------- | ----: | ----------: | ----------------: | ------------------------------------------------------------------------------------------------ | +| `act` | 4 | **83.9** | **0.94** | First-time users, laptops, single-task. Fast and reliable. | +| `diffusion` | 4 | 168.6 | 4.94 | Multi-modal action distributions; needs mid-range GPU. | +| `smolvla` | 1 | 357.8 | 3.93 | Language-conditioned, multi-task, small VLA. **Unfreeze vision encoder for big gains** (see §7). 
| +| `xvla` | 1 | 731.6 | 15.52 | Large VLA, multi-task. | +| `wall_x` | 1 | 716.5 | 15.95 | Large VLA with world-model objective. | +| `pi0` | 1 | 940.3 | 15.50 | Strong large VLA baseline (Physical Intelligence). | +| `pi05` | 1 | 1055.8 | 16.35 | Newer π policy; similar footprint to `pi0`. | + +**Critical caveats:** + +- **Optimizer:** measured with **SGD**. LeRobot's default is **AdamW**, which keeps extra optimizer state → **peak memory will be noticeably higher** with the default, especially for `pi0`, `pi05`, `wall_x`, `xvla`. +- **Batch size:** the large policies were profiled at batch 1. In practice use a **larger batch** for stable training (see §7.4). Memory scales roughly linearly with batch. + +### 6.2 Decision rules + +- **< 8 GB VRAM (laptop, 3060, M-series Mac):** → `act`. Maybe `diffusion` if you have ~6–8 GB free. +- **12–16 GB VRAM (4070/4080, A4000):** → `smolvla` with defaults, or `act`/`diffusion` with larger batch. `pi0`/`pi05`/`wall_x`/`xvla` feasible only with small batch + gradient accumulation. +- **24+ GB VRAM (3090/4090/A5000):** → any policy. Prefer `smolvla` (unfrozen) for multi-task; `act` for single-task grasp-and-place (still often the best ROI). Could experiment with `pi0` or `pi05` or `xvla` +- **80 GB (A100/H100):** → any, with healthy batch. `pi05`, `xvla`, `wall_x` become comfortable. +- **CPU only:** → don't train here. Use Google Colab (see [`docs/source/notebooks.mdx`](./docs/source/notebooks.mdx)) or a rented GPU. + +--- + +## 7. How long should I train? + +Robotics imitation learning usually converges in a **few epochs over the dataset**, not hundreds of thousands of raw steps. Think **epochs first**, then translate to steps. + +### 7.1 Rule of thumb + +- **Typical total: 5–10 epochs.** Start at 5, eval, then decide if more helps. +- Very small datasets (< 30 episodes) may want slightly more epochs — but first, **collect more data**. 
+- VLAs with a pretrained vision backbone typically need **fewer** epochs than training from scratch. + +### 7.2 Steps ↔ epochs conversion + +``` +total_frames = sum of frames over all episodes # e.g. 50 eps × 30 fps × 30 s ≈ 45,000 +steps_per_epoch = ceil(total_frames / batch_size) +total_steps = epochs × steps_per_epoch +``` + +Examples for `--batch_size=8`: + +| Dataset size | Frames | Steps / epoch | 5 epochs | 10 epochs | +| ----------------------- | ------: | ------------: | -------: | --------: | +| 50 eps × 30 s @ 30 fps | 45,000 | ~5,625 | 28k | 56k | +| 100 eps × 30 s @ 30 fps | 90,000 | ~11,250 | 56k | 113k | +| 300 eps × 30 s @ 30 fps | 270,000 | ~33,750 | 169k | 338k | + +Pass the resulting total with `--steps=`; eval at intermediate checkpoints (`outputs/train/.../checkpoints/`). + +### 7.3 Per-policy starting points (single-task, ~50 episodes) + +| Policy | Batch | Steps (first run) | Notes | +| -------------- | ----: | ----------------: | ----------------------------------------------------------------- | +| `act` | 8–16 | 30k–80k | Usually converges under 50k for single-task. | +| `diffusion` | 8–16 | 80k–150k | Benefits from longer training than ACT. | +| `smolvla` | 4–8 | 30k–80k | Pretrained VLM → converges fast. | +| `pi0` / `pi05` | 1–4 | 30k–80k | Memory-bound; use gradient accumulation for effective batch ≥ 16! | + +### 7.4 Batch size guidance + +- **Bigger batch is preferable** for stable gradients on teleop data. +- If GPU memory is the bottleneck, use **gradient accumulation** to raise _effective_ batch without raising peak memory. +- Scale **learning rate** gently with batch; most LeRobot defaults work fine for a 2–4× batch change. + +### 7.5 Scale LR schedule & checkpoints with `--steps` + +LeRobot's default schedulers (e.g. SmolVLA's cosine decay) use `scheduler_decay_steps=30_000`, which is sized for long training runs. When you shorten training (e.g. 
5k–10k steps on a small dataset), **scale the scheduler down to match** — otherwise the LR stays near the peak and never decays. Same for checkpoint frequency. + +```bash +lerobot-train ... \ + --steps=5000 \ + --policy.scheduler_decay_steps=5000 \ + --save_freq=5000 +``` + +Rule of thumb: set `scheduler_decay_steps ≈ steps`, and `save_freq` to whatever granularity you want for eval (e.g. every 1k–5k steps). Match `scheduler_warmup_steps` proportionally if your run is very short. + +### 7.6 SmolVLA: unfreeze the vision encoder for real gains + +SmolVLA ships with `freeze_vision_encoder=True`. Unfreezing usually **improves performance substantially** on specialized tasks, at the cost of more VRAM and slower steps. Enable with: + +```bash +lerobot-train ... --policy.type=smolvla \ + --policy.freeze_vision_encoder=false \ + --policy.train_expert_only=false +``` + +### 7.7 Signals to stop / keep going + +- Train loss plateaus → stop, save a Hub checkpoint. +- Train loss still dropping and you're under 10 epochs → keep going. + +--- + +## 8. Evaluation & benchmarks + +Two flavors of evaluation: + +### 8.1 Real-robot eval (SO-101, etc.) + +Reuse `lerobot-record` with `--policy.path` to run the trained policy on-robot and save the run as an eval dataset. Convention: prefix the dataset with `eval_`. + +```bash +lerobot-record \ + --robot.type=so101_follower --robot.port= --robot.id=my_follower \ + --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --dataset.repo_id=${HF_USER}/eval_my_task \ + --dataset.single_task="" \ + --dataset.num_episodes=10 \ + --policy.path=${HF_USER}/act_my_task +``` + +Report success rate across episodes. Compare to a teleoperated baseline and to an earlier checkpoint to catch regressions. 
+ +### 8.2 Sim-benchmark eval + +For policies trained on sim datasets (PushT, Aloha, LIBERO, MetaWorld, RoboCasa, …) use `lerobot-eval` against the matching `env.type`: + +```bash +lerobot-eval \ + --policy.path=${HF_USER}/diffusion_pusht \ + --env.type=pusht \ + --eval.n_episodes=50 \ + --eval.batch_size=10 \ + --policy.device=cuda +``` + +- Use `--policy.path=outputs/train/.../checkpoints//pretrained_model` for local checkpoints. +- `--eval.n_episodes` should be ≥ 50 for a stable success-rate estimate. +- Available envs live in `src/lerobot/envs/`. See [`docs/source/libero.mdx`](./docs/source/libero.mdx), [`metaworld.mdx`](./docs/source/metaworld.mdx), [`robocasa.mdx`](./docs/source/robocasa.mdx), [`vlabench.mdx`](./docs/source/vlabench.mdx) for specific benchmarks. +- To add a new benchmark, see [`docs/source/adding_benchmarks.mdx`](./docs/source/adding_benchmarks.mdx) and [`envhub.mdx`](./docs/source/envhub.mdx). + +### 8.2b Dockerfiles for benchmark eval + +Benchmark envs have native dependencies that are painful to install locally. 
The repo ships **pre-baked Dockerfiles** for each supported benchmark — use these to run `lerobot-eval` in a reproducible environment: + +| Benchmark | Dockerfile | +| ----------- | -------------------------------------------------------------------------------------- | +| LIBERO | [`docker/Dockerfile.benchmark.libero`](./docker/Dockerfile.benchmark.libero) | +| LIBERO+ | [`docker/Dockerfile.benchmark.libero_plus`](./docker/Dockerfile.benchmark.libero_plus) | +| MetaWorld | [`docker/Dockerfile.benchmark.metaworld`](./docker/Dockerfile.benchmark.metaworld) | +| RoboCasa | [`docker/Dockerfile.benchmark.robocasa`](./docker/Dockerfile.benchmark.robocasa) | +| RoboCerebra | [`docker/Dockerfile.benchmark.robocerebra`](./docker/Dockerfile.benchmark.robocerebra) | +| RoboMME | [`docker/Dockerfile.benchmark.robomme`](./docker/Dockerfile.benchmark.robomme) | +| RoboTwin | [`docker/Dockerfile.benchmark.robotwin`](./docker/Dockerfile.benchmark.robotwin) | +| VLABench | [`docker/Dockerfile.benchmark.vlabench`](./docker/Dockerfile.benchmark.vlabench) | + +Build and run (adapt to your benchmark): + +```bash +docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-bench-robomme . +docker run --gpus all --rm -it \ + -v $HOME/.cache/huggingface:/root/.cache/huggingface \ + lerobot-bench-robomme \ + lerobot-eval --policy.path= --env.type= --eval.n_episodes=50 +``` + +See [`docker/README.md`](./docker/README.md) for base-image details. + +### 8.3 Target success rates + +Single-task grasp-and-place with 50 clean episodes: ACT should reach **> 70% success** on the training configuration. Less → data problem (see §5), not model problem. Expect a drop when generalizing to new positions — scale episodes or diversity to recover. + +--- + +## 9. 
Further reading & resources + +- **Getting started:** [`installation.mdx`](./docs/source/installation.mdx) · [`il_robots.mdx`](./docs/source/il_robots.mdx) · [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets) +- **Per-policy docs:** browse [`docs/source/*.mdx`](./docs/source/) (policies, hardware, benchmarks, advanced training). +- **Community:** [Discord](https://discord.com/invite/s3KuuzsPFb) · [Hub `LeRobot` tag](https://huggingface.co/datasets?other=LeRobot) · [Dataset visualizer](https://huggingface.co/spaces/lerobot/visualize_dataset) + +> Keep this file current. If you learn a rule that would prevent a class of user mistakes, add it here and in [`AGENTS.md`](./AGENTS.md). diff --git a/docker/Dockerfile.benchmark.libero_plus b/docker/Dockerfile.benchmark.libero_plus new file mode 100644 index 000000000..5911329a4 --- /dev/null +++ b/docker/Dockerfile.benchmark.libero_plus @@ -0,0 +1,84 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for LIBERO-plus integration tests. +# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus +# fork source + its 6.4 GB perturbation assets. +# +# Build: docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus . +# Run: docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ... 
+
+FROM huggingface/lerobot-gpu:latest
+ENV MUJOCO_GL=egl
+
+# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
+# (wand / ImageMagick / fontconfig) not in the nightly base.
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
+# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
+# We install these explicitly instead of via the [libero_plus] extra: the
+# extra's `libero @ git+...` dep installs as a namespace package, and we
+# clone and PYTHONPATH-override the fork ourselves below.
+RUN uv pip install --no-cache \
+    "robosuite==1.4.1" \
+    "bddl==1.0.1" \
+    "easydict==1.13" \
+    "mujoco==3.7.0" \
+    "matplotlib==3.10.8" \
+    "Wand==0.6.13" \
+    "scikit-image==0.25.2" \
+    "gym==0.26.2"
+
+# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
+# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
+# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
+# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
+ARG LIBERO_PLUS_SHA=4976dc3
+ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
+RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
+    && git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
+    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
+    && (uv pip uninstall hf-libero 2>/dev/null || true)
+ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
+
+# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
+# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml).
All +# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's +# assets.zip (6.4 GB) under a deep author-internal prefix — extract and +# flatten it under ${LIBERO_PLUS_ROOT}/assets. +RUN python -c "\ +from huggingface_hub import hf_hub_download; \ +hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \ + filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \ + && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \ + && ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \ + && mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \ + && rm -rf /tmp/libero-plus-dl + +# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are +# non-interactive (it calls input() when the config is missing). +RUN mkdir -p /home/user_lerobot/.libero \ + && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robocasa b/docker/Dockerfile.benchmark.robocasa new file mode 100644 index 000000000..9de1612cb --- /dev/null +++ b/docker/Dockerfile.benchmark.robocasa @@ -0,0 +1,71 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboCasa365 integration tests. +# Extends the nightly GPU image (which already has all extras installed) +# with the PR's source code and RoboCasa-specific asset setup. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa . +# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Install robocasa + robosuite as editable clones. pip-installing from git +# omits data files like robocasa/models/assets/box_links/box_links_assets.json +# (not declared in package_data), which download_kitchen_assets needs at import. +# +# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3` +# in install_requires, which would shadow the editable lerobot baked into +# this image. We install robocasa's actual runtime deps explicitly instead. +# Pinned SHAs for reproducible benchmark runs. Bump when you need an +# upstream fix; don't rely on `main`/`master` drift. +ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681 +ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab +RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \ + git -C ~/robocasa checkout ${ROBOCASA_SHA} && \ + git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \ + git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \ + uv pip install --no-cache -e ~/robocasa --no-deps && \ + uv pip install --no-cache -e ~/robosuite && \ + uv pip install --no-cache \ + "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \ + "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \ + "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \ + "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \ + "tianshou==0.4.10" "gymnasium==1.2.3" + +# Set up robocasa macros and download kitchen assets. 
We need: +# - tex : base environment textures +# - tex_generative : AI-generated textures; kitchen fixture XMLs embed +# refs to generative_textures/wall/tex*.png +# unconditionally, so MjModel.from_xml_string fails +# at reset time without them (even if the env is +# constructed with generative_textures=None). +# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...) +# - objs_lw : lightwheel object meshes (stools, misc props) +# We skip the objaverse/aigen object packs (~30GB combined) by pairing +# this with --env.obj_registries=["lightwheel"] on the lerobot side. +# The download script prompts interactively, so pipe 'y' to auto-accept. +RUN python -m robocasa.scripts.setup_macros && \ + yes y | python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +# Re-install lerobot editably so the new source (with RoboCasaEnv registration) +# replaces the stale package baked into the nightly image. +RUN uv pip install --no-cache --no-deps -e . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robocerebra b/docker/Dockerfile.benchmark.robocerebra new file mode 100644 index 000000000..9378bd66a --- /dev/null +++ b/docker/Dockerfile.benchmark.robocerebra @@ -0,0 +1,43 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Benchmark image for RoboCerebra integration tests. +# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different +# rename_map, so this image is identical to the LIBERO benchmark image — +# extends the nightly GPU base with LIBERO assets + the PR's source code. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra . +# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at +# runtime (which times out on CI). Point the libero config at the cached path. +# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing, +# so we write the config before any libero import can happen. +RUN LIBERO_DIR=$(python -c \ + "import importlib.util, os; s=importlib.util.find_spec('libero'); \ + print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ + mkdir -p /home/user_lerobot/.libero && \ + python -c "\ +from huggingface_hub import snapshot_download; \ +snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \ + local_dir='/home/user_lerobot/.libero/assets')" && \ + printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robomme b/docker/Dockerfile.benchmark.robomme new file mode 100644 index 000000000..2bfc83b4f --- /dev/null +++ b/docker/Dockerfile.benchmark.robomme @@ -0,0 +1,56 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboMME integration tests. +# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system +# libs for ManiSkill/SAPIEN and the robomme extra. robomme isn't in [all] +# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which +# conflict with lerobot's defaults; both are safe at runtime: +# - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26) +# - numpy 1.26.4 is API-compatible with lerobot's actual usage. +# +# Build: docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme . +# Run: docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering. +ENV NVIDIA_DRIVER_CAPABILITIES=all \ + VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json + +# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image. +USER root +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libvulkan1 libvulkan-dev mesa-vulkan-drivers \ + && mkdir -p /usr/share/vulkan/icd.d \ + && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \ + > /usr/share/vulkan/icd.d/nvidia_icd.json \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +USER user_lerobot + +# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top +# with gymnasium/numpy overrides. 
robomme isn't a pyproject extra because its +# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml). +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \ + && uv pip install --no-cache --override /tmp/robomme_override.txt \ + -e ".[smolvla,av-dep]" \ + "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \ + && python -c "import robomme; print('robomme import OK')" + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robotwin b/docker/Dockerfile.benchmark.robotwin new file mode 100644 index 000000000..57ee21f4b --- /dev/null +++ b/docker/Dockerfile.benchmark.robotwin @@ -0,0 +1,138 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboTwin 2.0 integration tests. +# Extends the nightly GPU image with the RoboTwin simulator stack: +# sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip +# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval). +# +# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin . +# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \ +# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ... 
+ +FROM huggingface/lerobot-gpu:latest + +ENV NVIDIA_DRIVER_CAPABILITIES=all \ + VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \ + ROBOTWIN_ROOT=/opt/robotwin + +# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's +# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both. +USER root +# Pinned upstream SHA for reproducible benchmark runs. Bump when we need +# an upstream fix; don't rely on `main` drift. +ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4 +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-nvcc-12-4 cuda-cudart-dev-12-4 \ + libvulkan1 vulkan-tools \ + && mkdir -p /usr/share/vulkan/icd.d \ + && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \ + > /usr/share/vulkan/icd.d/nvidia_icd.json \ + && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \ + && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \ + && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +USER user_lerobot + +# RoboTwin runtime deps (av is already in the base via [av-dep]). +RUN uv pip install --no-cache \ + "sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \ + "open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py + +# pytorch3d has no universal wheel; must be built from source (~10 min, cached). +RUN uv pip install --no-cache --no-build-isolation \ + "git+https://github.com/facebookresearch/pytorch3d.git@stable" + +# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the +# build aborts on an empty arch list. RoboTwin's own installer pins v0.7.8, +# which still exposes the v1 API (`curobo.types.math`) that RoboTwin imports. 
+ARG CUROBO_REF=v0.7.8 +RUN cd ${ROBOTWIN_ROOT}/envs \ + && git clone --branch ${CUROBO_REF} --depth 1 https://github.com/NVlabs/curobo.git \ + && cd curobo \ + && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \ + uv pip install -e . --no-build-isolation --no-cache + +# Upstream patches (mirror RoboTwin's script/_install.sh). +# These patches target the exact versions pinned above; re-check when upgrading. +# mplib==0.2.1: drop a broken `or collide` clause in planner.py. +# Safe to remove once mplib > 0.2.1 ships with the fix upstream. +# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check. +# Safe to remove once sapien > 3.0.0b1 ships with the fix upstream. +RUN python - <<'EOF' +import pathlib, re, site +for d in site.getsitepackages(): + p = pathlib.Path(d) / "mplib" / "planner.py" + if p.exists(): + p.write_text(re.sub(r"\bor collide\b", "", p.read_text(), count=1)) + print(f"mplib patch applied: {p}") + p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py" + if p.exists(): + src = p.read_text().replace( + "with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:' + ).replace('"srdf"', '".srdf"') + p.write_text(src) + print(f"sapien patch applied: {p}") +EOF + +# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) + +# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped. +# The dataset is public — no auth token needed. 
+RUN python - <<'EOF' +import os, pathlib, zipfile +from huggingface_hub import hf_hub_download + +assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets" +assets_dir.mkdir(parents=True, exist_ok=True) +for fname in ("embodiments.zip", "objects.zip"): + local = hf_hub_download( + repo_id="TianxingChen/RoboTwin2.0", + repo_type="dataset", + filename=fname, + local_dir=str(assets_dir), + ) + with zipfile.ZipFile(local, "r") as z: + z.extractall(str(assets_dir)) + pathlib.Path(local).unlink() +EOF + +WORKDIR ${ROBOTWIN_ROOT} +RUN python script/update_embodiment_config_path.py + +ENV PYTHONPATH="${ROBOTWIN_ROOT}" + +# Fail the image build early if the CuRobo package layout regresses. Importing +# RoboTwin's planner here is too eager because CuRobo constructs CUDA-backed +# defaults at import time, while Docker builds don't have access to an NVIDIA +# driver. +RUN python - <<'EOF' +from pathlib import Path + +from curobo.types.math import Pose + +planner_src = (Path("/opt/robotwin/envs/robot/planner.py")).read_text() +assert "from curobo.types.math import Pose as CuroboPose" in planner_src + +print("CuRobo import OK:", Pose.__name__) +print("RoboTwin planner import references curobo.types.math") +EOF + +# Return to the lerobot source directory (set by base image) before overlaying. +WORKDIR /lerobot + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.vlabench b/docker/Dockerfile.benchmark.vlabench new file mode 100644 index 000000000..13502a3e3 --- /dev/null +++ b/docker/Dockerfile.benchmark.vlabench @@ -0,0 +1,99 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for VLABench integration tests. +# Extends the nightly GPU image with the PR's source code and VLABench setup. +# +# Build: docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench . +# Run: docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control. +# Shallow-clone without submodule recursion (nested SSH-only submodules fail in CI). +# Editable install (-e) because VLABench/utils/ has no __init__.py, so +# find_packages() omits it from wheels; editable mode uses the source tree directly. +# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py). +# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import +# time. Guard the call so missing dirs return [] instead of crashing (in case +# the asset download is partial). +# +# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need +# an upstream fix; don't rely on `main`/`develop` drift. 
+ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017 +ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85 +RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \ + git -C ~/VLABench checkout ${VLABENCH_SHA} && \ + git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \ + git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \ + python3 -c "\ +import pathlib; \ +p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \ +t = p.read_text(); \ +p.write_text(t.replace( \ + 'subdirs = os.listdir(xml_dir)', \ + 'if not os.path.isdir(xml_dir): return []\n subdirs = os.listdir(xml_dir)'))" && \ + uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \ + mujoco==3.2.2 dm-control==1.0.22 \ + open3d colorlog scikit-learn openai gdown + +# Download VLABench mesh assets. Task configs reference object meshes +# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without +# them the task builder picks from an empty mesh list and crashes with +# IndexError at task-build time (random.choice([]) in config_manager.py). +# +# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time +# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll +# snapshot_download the repo into VLABench's assets dir. This is the reliable +# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have +# viewed or downloaded this file recently") on shared academic files. +# +# After download we *validate* that at least one XML exists under each +# task-critical subtree and fail the build loudly if not. Silent-empty asset +# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them +# here rather than after a 10-minute eval build. +# +# Fallback: VLABench's own gdown-based script. Best-effort only. 
+ARG VLABENCH_ASSETS_REPO="" +RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \ + if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \ + echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \ + uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \ + python -c "from huggingface_hub import snapshot_download; \ +p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \ + local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \ +print('snapshot_download returned:', p)"; \ + else \ + echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \ + python ~/VLABench/scripts/download_assets.py --choice all; \ + fi && \ + python -c "\ +from pathlib import Path; \ +import sys; \ +root = Path('${ASSETS_DIR}'); \ +checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \ +failed = []; \ +print(f'Validating VLABench assets under {root}'); \ +[print(f' {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \ +[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \ +sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')" + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +# Re-install lerobot editably so the new source (with VLABenchEnv registration +# and updated obs handling) replaces the stale package baked into the nightly image. +RUN uv pip install --no-cache --no-deps -e . 
+ +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 582d4fc14..01e8bfb76 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -79,10 +79,22 @@ title: Adding a New Benchmark - local: libero title: LIBERO + - local: libero_plus + title: LIBERO-plus - local: metaworld title: Meta-World + - local: robotwin + title: RoboTwin 2.0 + - local: robocasa + title: RoboCasa365 + - local: robocerebra + title: RoboCerebra + - local: robomme + title: RoboMME - local: envhub_isaaclab_arena title: NVIDIA IsaacLab Arena Environments + - local: vlabench + title: VLABench title: "Benchmarks" - sections: - local: introduction_processors diff --git a/docs/source/libero_plus.mdx b/docs/source/libero_plus.mdx new file mode 100644 index 000000000..4249bf49e --- /dev/null +++ b/docs/source/libero_plus.mdx @@ -0,0 +1,188 @@ +# LIBERO-plus + +LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss. 
+ +- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626) +- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus) +- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus) + +![An overview of the LIBERO-plus benchmark perturbation dimensions](https://github.com/sylvestf/LIBERO-plus/raw/main/static/images/libero-plus.jpg) + +## Perturbation dimensions + +LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes: + +| Dimension | What changes | +| --------------------- | ----------------------------------------------------- | +| Objects layout | Target position, presence of confounding objects | +| Camera viewpoints | Camera position, orientation, field-of-view | +| Robot initial states | Manipulator start pose | +| Language instructions | LLM-rewritten task description (paraphrase / synonym) | +| Light conditions | Intensity, direction, color, shadow | +| Background textures | Scene surface and object appearance | +| Sensor noise | Photometric distortions and image degradation | + +## Available task suites + +LIBERO-plus covers the same five suites as LIBERO: + +| Suite | CLI name | Tasks | Max steps | Description | +| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- | +| LIBERO-Spatial | `libero_spatial` | 10 | 280 | Tasks requiring reasoning about spatial relations | +| LIBERO-Object | `libero_object` | 10 | 280 | Tasks centered on manipulating different objects | +| LIBERO-Goal | `libero_goal` | 10 | 300 | Goal-conditioned tasks with changing targets | +| LIBERO-90 | `libero_90` | 90 | 400 | Short-horizon tasks from the LIBERO-100 collection | +| LIBERO-Long | `libero_10` | 10 | 520 | Long-horizon tasks from the LIBERO-100 collection | + + + Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero` + so that `import libero` resolves to the 
LIBERO-plus fork. You cannot have both + installed at the same time. To switch back to vanilla LIBERO, uninstall the + fork and reinstall with `pip install -e ".[libero]"`. + + +## Installation + +### System dependencies (Linux only) + +```bash +sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev +``` + +### Python package + +```bash +pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym +git clone https://github.com/sylvestf/LIBERO-plus.git +cd LIBERO-plus && pip install --no-deps -e . +pip uninstall -y hf-libero # so `import libero` resolves to the fork +``` + +LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't handle, so it must be cloned and added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported. + + +Set the MuJoCo rendering backend before running evaluation: + +```bash +export MUJOCO_GL=egl # headless / HPC / cloud +``` + + + +### Download LIBERO-plus assets + +LIBERO-plus ships its extended asset pack separately. 
Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
+
+```bash
+# After installing the package, find where it was installed:
+python -c "import libero; print(libero.__file__)"
+# Then extract assets.zip into the libero/ package directory printed above,
+# so the files land under <package_dir>/libero/assets/
+```
+
+## Evaluation
+
+### Default evaluation (recommended)
+
+Evaluate across the four standard suites (10 episodes per task):
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object,libero_goal,libero_10 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --env.max_parallel_tasks=1
+```
+
+### Single-suite evaluation
+
+Evaluate on one LIBERO-plus suite:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
+- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
+- `--eval.batch_size` controls how many environments run in parallel.
+- `--eval.n_episodes` sets how many episodes to run per task.
+
+### Multi-suite evaluation
+
+Benchmark a policy across multiple suites at once by passing a comma-separated list:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+### Control mode
+
+LIBERO-plus supports two control modes — `relative` (default) and `absolute`. 
Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy: + +```bash +--env.control_mode=relative # or "absolute" +``` + +### Policy inputs and outputs + +**Observations:** + +- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos) +- `observation.images.image` — main camera view (`agentview_image`), HWC uint8 +- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper + +### Recommended evaluation episodes + +For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results. + +## Training + +### Dataset + +A LeRobot-format training dataset for LIBERO-plus is available at: + +- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus) + +### Example training command + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_libero_plus \ + --policy.load_vlm_weights=true \ + --dataset.repo_id=lerobot/libero_plus \ + --env.type=libero_plus \ + --env.task=libero_spatial \ + --output_dir=./outputs/ \ + --steps=100000 \ + --batch_size=4 \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval_freq=1000 +``` + +## Relationship to LIBERO + +LIBERO-plus is a drop-in extension of LIBERO: + +- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`) +- Same camera names and observation/action format +- Same task suite names +- Installs under the same `libero` Python package name (different GitHub repo) + +To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`. 
diff --git a/docs/source/robocasa.mdx b/docs/source/robocasa.mdx new file mode 100644 index 000000000..f6a784e72 --- /dev/null +++ b/docs/source/robocasa.mdx @@ -0,0 +1,188 @@ +# RoboCasa365 + +[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base). + +- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523) +- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa) +- Project website: [robocasa.ai](https://robocasa.ai) +- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) +- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge) + +RoboCasa365 benchmark overview + +## Available tasks + +RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts: + +| Family | Tasks | Description | +| --------- | ----- | ------------------------------------------------------------------------------- | +| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control | +| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. | + +**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`. + +**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more. 
+ +`--env.task` accepts three forms: + +- a single task name (`CloseFridge`) +- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`) +- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`). + +## Installation + +RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin): + +```bash +# After following the standard LeRobot installation instructions. + +git clone https://github.com/robocasa/robocasa.git ~/robocasa +git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite +pip install -e ~/robocasa --no-deps +pip install -e ~/robosuite + +# Robocasa's runtime deps (the ones its setup.py would have pulled, minus +# the bad lerobot pin). +pip install numpy numba scipy mujoco pygame Pillow opencv-python \ + pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \ + tianshou gymnasium + +python -m robocasa.scripts.setup_macros +# Lightweight assets (lightwheel object meshes + textures). Enough for +# the default env out of the box. +python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw +# Optional: full objaverse/aigen registries (~30GB) for richer object +# variety. Enable at eval time via --env.obj_registries (see below). +# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse +``` + + +RoboCasa requires MuJoCo. 
Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +### Object registries + +By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime: + +```bash +--env.obj_registries='[objaverse,lightwheel]' +``` + +## Evaluation + +All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on. + +### Single-task evaluation (recommended for quick iteration) + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Multi-task evaluation + +Pass a comma-separated list of tasks: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": 
"observation.images.camera3"}' +``` + +### Benchmark-group evaluation + +Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`): + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=atomic_seen \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Recommended evaluation episodes + +**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results. + +## Policy inputs and outputs + +**Observations** (raw RoboCasa camera names are preserved verbatim): + +- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos) +- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8 +- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8 +- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D). + +## Training + +### Single-task example + +A ready-to-use single-task dataset is on the Hub: +[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge). 
+ +Fine-tune a SmolVLA base on `CloseFridge`: + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \ + --policy.load_vlm_weights=true \ + --policy.push_to_hub=true \ + --dataset.repo_id=pepijn223/robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --output_dir=./outputs/smolvla_robocasa_CloseFridge \ + --steps=100000 \ + --batch_size=4 \ + --eval_freq=5000 \ + --eval.batch_size=1 \ + --eval.n_episodes=5 \ + --save_freq=10000 +``` + +Evaluate the resulting checkpoint: + +```bash +lerobot-eval \ + --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack. diff --git a/docs/source/robocerebra.mdx b/docs/source/robocerebra.mdx new file mode 100644 index 000000000..9776bd40f --- /dev/null +++ b/docs/source/robocerebra.mdx @@ -0,0 +1,99 @@ +# RoboCerebra + +[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF). 
+ +- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677) +- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/) +- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks. +- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) + +## Available tasks + +RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite: + +| Suite | CLI name | Tasks | Description | +| --------- | ----------- | ----- | ------------------------------------------------------------- | +| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals | + +Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals. + +## Installation + +RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need: + +```bash +pip install -e ".[libero]" +``` + + +RoboCerebra requires Linux (MuJoCo / robosuite). 
Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +## Evaluation + +RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \ + --policy.empty_cameras=1 +``` + +### Recommended evaluation episodes + +**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper. 
+ +## Policy inputs and outputs + +**Observations:** + +- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper) +- `observation.images.image` — third-person view, 256×256 HWC uint8 +- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D) + +## Training + +The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations: + +| Feature | Shape | Description | +| -------------------------------- | ------------- | -------------------- | +| `observation.images.image` | (256, 256, 3) | Third-person view | +| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera | +| `observation.state` | (8,) | Joint pos + gripper | +| `action` | (7,) | EEF delta + gripper | + +Fine-tune a SmolVLA base on it: + +```bash +lerobot-train \ + --policy.path=lerobot/smolvla_base \ + --dataset.repo_id=lerobot/robocerebra_unified \ + --env.type=libero \ + --env.task=libero_10 \ + --output_dir=outputs/smolvla_robocerebra +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark. diff --git a/docs/source/robomme.mdx b/docs/source/robomme.mdx new file mode 100644 index 000000000..6613a3923 --- /dev/null +++ b/docs/source/robomme.mdx @@ -0,0 +1,130 @@ +# RoboMME + +[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). 
It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
+
+- **16 tasks** across 4 memory-skill suites
+- **1,600 training demos** (100 per task, 50 val, 50 test)
+- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
+- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
+
+![RoboMME benchmark tasks overview](https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2603.04639/gradient.png)
+
+## Tasks
+
+| Suite                             | Tasks                                                         |
+| --------------------------------- | ------------------------------------------------------------- |
+| **Counting** (temporal memory)    | BinFill, PickXtimes, SwingXtimes, StopCube                    |
+| **Permanence** (spatial memory)   | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap  |
+| **Reference** (object memory)     | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
+| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick                  |
+
+## Installation
+
+> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
+
+### Native (Linux)
+
+```bash
+uv pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
+  -e '.[smolvla,av-dep]' \
+  'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
+```
+
+> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above (note: `--override` is a `uv pip` flag; plain `pip` has no equivalent), or the Docker approach below to avoid conflicts entirely.
+
+### Docker (recommended)
+
+```bash
+# Build base image first (from repo root)
+docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
+
+# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
+docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
+```
+
+The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
+
+## Running Evaluation
+
+### Default (single task, single episode)
+
+```bash
+lerobot-eval \
+  --policy.path=<path/to/your/policy> \
+  --env.type=robomme \
+  --env.task=PickXtimes \
+  --env.dataset_split=test \
+  --env.task_ids=[0] \
+  --eval.batch_size=1 \
+  --eval.n_episodes=1
+```
+
+### Multi-task evaluation
+
+Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
+
+```bash
+lerobot-eval \
+  --policy.path=<path/to/your/policy> \
+  --env.type=robomme \
+  --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+  --env.dataset_split=test \
+  --env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
+  --eval.batch_size=1 \
+  --eval.n_episodes=50
+```
+
+### Key CLI options for `env.type=robomme`
+
+| Option               | Default       | Description                                        |
+| -------------------- | ------------- | -------------------------------------------------- |
+| `env.task`           | `PickXtimes`  | Any of the 16 task names above (comma-separated)   |
+| `env.dataset_split`  | `test`        | `train`, `val`, or `test`                          |
+| `env.action_space`   | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D)             |
+| `env.episode_length` | `300`         | Max steps per episode                              |
+| `env.task_ids`       | `null`        | List of episode indices to evaluate (null = `[0]`) |
+
+## Dataset
+
+The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
+
+```python
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+dataset = LeRobotDataset("lerobot/robomme")
+```
+
+### Dataset features
+
+| Feature            | Shape         | Description                     |
+| ------------------ | 
------------- | ------------------------------- | +| `image` | (256, 256, 3) | Front camera RGB | +| `wrist_image` | (256, 256, 3) | Wrist camera RGB | +| `actions` | (8,) | Joint angles + gripper | +| `state` | (8,) | Joint positions + gripper state | +| `simple_subgoal` | str | High-level language annotation | +| `grounded_subgoal` | str | Grounded language annotation | +| `episode_index` | int | Episode ID | +| `frame_index` | int | Frame within episode | + +### Feature key alignment (training) + +The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`. + +The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning. + +## Action Spaces + +| Type | Dim | Description | +| ------------- | --- | --------------------------------------------------------- | +| `joint_angle` | 8 | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) | +| `ee_pose` | 7 | xyz + roll/pitch/yaw + gripper | + +Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`. + +## Platform Notes + +- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported. +- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed. +- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically. +- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package. 
diff --git a/docs/source/robotwin.mdx b/docs/source/robotwin.mdx new file mode 100644 index 000000000..ad1db766f --- /dev/null +++ b/docs/source/robotwin.mdx @@ -0,0 +1,223 @@ +# RoboTwin 2.0 + +RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions). + +- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088) +- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin) +- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard) +- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified) + +![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png) + +## Overview + +| Property | Value | +| ------------- | -------------------------------------------------------- | +| Tasks | 50 dual-arm manipulation tasks | +| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) | +| Action space | 14-dim joint-space, continuous in `[-1, 1]` | +| Cameras | `head_camera`, `left_camera`, `right_camera` | +| Simulator | SAPIEN (not MuJoCo) | +| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations | +| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) | + +## Available tasks + +RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. 
Example tasks: + +| Task | CLI name | Category | +| ------------------------ | ------------------------ | ----------------- | +| Beat block with hammer | `beat_block_hammer` | Tool use | +| Click bell / alarm clock | `click_bell` | Precision press | +| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking | +| Stack bowls (2 / 3) | `stack_bowls_two/three` | Stacking | +| Handover block / mic | `handover_block` | Bimanual coord. | +| Lift pot | `lift_pot` | Bimanual lift | +| Shake bottle | `shake_bottle` | Continuous motion | +| Turn switch | `turn_switch` | Articulated obj | +| Stamp seal | `stamp_seal` | Precision place | +| Scan object | `scan_object` | Mobile manip. | + +Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep. + + + `open_laptop` is currently broken upstream (its `check_success()` uses + `self.arm_tag`, which is only set inside the scripted-expert `play_once()` + path and therefore unavailable during normal policy eval). Avoid it until the + upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in + `load_actors()`. + + +## Dataset + +The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub: + +``` +lerobot/robotwin_unified +``` + +It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels. + +You can load it directly with the HF Datasets library: + +```python +from datasets import load_dataset + +ds = load_dataset("lerobot/robotwin_unified", split="train") +``` + +## Installation + +RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes. + +### 1. Create a conda environment + +```bash +conda create -n robotwin python=3.10 -y +conda activate robotwin +``` + +### 2. 
Install LeRobot + +```bash +git clone https://github.com/huggingface/lerobot.git +cd lerobot +pip install -e "." +``` + +### 3. Install RoboTwin 2.0 + +```bash +git clone https://github.com/RoboTwin-Platform/RoboTwin.git +cd RoboTwin +bash script/_install.sh +bash script/_download_assets.sh +``` + +The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d. + + +If the automated install fails, install manually: + +```bash +pip install -r requirements.txt +pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" +cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo +pip install -e . --no-build-isolation +``` + +Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional. + + + +### 4. Add RoboTwin to PYTHONPATH + +The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory: + +```bash +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +``` + +Add this to your shell profile to make it permanent. 
+ +## Evaluation + +### Standard evaluation (recommended) + +Evaluate a policy on a single task with the official protocol (100 episodes): + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + +### Single-task quick check + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --eval.batch_size=1 \ + --eval.n_episodes=5 +``` + +### Multi-task sweep + +Evaluate on several tasks in one run: + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + +### Full benchmark (all 50 tasks) + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + + + `open_laptop` is intentionally omitted above because of the upstream + `self.arm_tag` bug (see the **Available tasks** section). Re-add it once the + upstream fix lands. 
+ + +## Camera configuration + +By default, all three cameras are included: + +| Camera key | Description | +| -------------- | ------------------------------ | +| `head_camera` | Torso-mounted overhead view | +| `left_camera` | Left arm wrist-mounted camera | +| `right_camera` | Right arm wrist-mounted camera | + +To use a subset of cameras, override `--env.camera_names`: + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --env.camera_names="head_camera,left_camera" \ + --eval.batch_size=1 \ + --eval.n_episodes=10 +``` + +## Environment config reference + +Key parameters for `RoboTwinEnvConfig`: + +| Parameter | Default | Description | +| -------------------- | ---------------------------------------- | ---------------------------------- | +| `task` | `"beat_block_hammer"` | Comma-separated task name(s) | +| `fps` | `25` | Simulation FPS | +| `episode_length` | `300` | Max steps per episode | +| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` | +| `camera_names` | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras | +| `observation_height` | `240` | Camera pixel height | +| `observation_width` | `320` | Camera pixel width | + +## Leaderboard submission + +Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires: + +- Training on 50 `demo_clean` demonstrations per task +- Evaluating 100 episodes per task +- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings + +For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/). 
diff --git a/docs/source/vlabench.mdx b/docs/source/vlabench.mdx new file mode 100644 index 000000000..da579d674 --- /dev/null +++ b/docs/source/vlabench.mdx @@ -0,0 +1,176 @@ +# VLABench + +[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below). + +- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194) +- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) +- Project website: [vlabench.github.io](https://vlabench.github.io) +- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) + +VLABench benchmark overview + +## Available tasks + +VLABench ships two task suites covering **43 task categories** in LeRobot's `--env.task` surface: + +| Suite | CLI name | Tasks | Description | +| --------- | ----------- | ----- | ---------------------------------------------------------------- | +| Primitive | `primitive` | 21 | Single / few-skill combinations (select, insert, physics QA) | +| Composite | `composite` | 22 | Multi-step reasoning and long-horizon planning (cook, rearrange) | + +**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks 
(`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`). + +**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`. + +`--env.task` accepts three forms: + +- a single task name (`select_fruit`) +- a comma-separated list (`select_fruit,heat_food`) +- a suite shortcut (`primitive`, `composite`, or `primitive,composite`) + +## Installation + +VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets: + +```bash +# After following the standard LeRobot installation instructions. + +git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench +git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms +pip install -e ~/VLABench -e ~/rrt-algorithms +pip install "mujoco==3.2.2" "dm-control==1.0.22" \ + open3d colorlog scikit-learn openai gdown + +python ~/VLABench/scripts/download_assets.py +``` + + +VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +## Evaluation + +All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). 
The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on. + +### Single-task evaluation (recommended for quick iteration) + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Multi-task evaluation + +Pass a comma-separated list of tasks: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=select_fruit,select_toy,add_condiment,heat_food \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Suite-wide evaluation + +Run an entire suite (all 21 primitives or all 22 composites): + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=primitive \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + --env.max_parallel_tasks=1 \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +Or both suites: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_vlabench \ + --env.type=vlabench \ + --env.task=primitive,composite \ + --eval.batch_size=1 \ 
+ --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + --env.max_parallel_tasks=1 \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}' +``` + +### Recommended evaluation episodes + +**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper. + +## Policy inputs and outputs + +**Observations:** + +- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper) +- `observation.images.image` — front camera, 480×480 HWC uint8 +- `observation.images.second_image` — second camera, 480×480 HWC uint8 +- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper. + +## Training + +### Datasets + +Pre-collected VLABench datasets in LeRobot format on the Hub: + +- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images. +- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images. 
+ +### Example training command + +Fine-tune a SmolVLA base on the primitive suite: + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \ + --policy.load_vlm_weights=true \ + --policy.push_to_hub=true \ + --dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \ + --env.type=vlabench \ + --env.task=select_fruit \ + --output_dir=./outputs/smolvla_vlabench_primitive \ + --steps=100000 \ + --batch_size=4 \ + --eval_freq=5000 \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --save_freq=10000 +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark. diff --git a/pyproject.toml b/pyproject.toml index 8f2f0606b..d3d0c0ed3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,6 +212,20 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"] pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"] metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"] +# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution +# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI +# release), so any `vlabench>=X` pip spec is unresolvable. Install it +# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx +# for the recipe. 
+# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2 +# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't +# resolve into a single env. Install it only in the RoboMME Docker image +# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme). +# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins +# `lerobot==0.3.3` in install_requires, which cyclically shadows our own +# workspace `lerobot` and makes the graph unsolvable under any resolver +# (uv, pip). Install it manually alongside robosuite — see +# docs/source/robocasa.mdx for the recipe. # All all = [ diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 5fbc1c35a..0d27885cf 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -31,9 +31,23 @@ from __future__ import annotations import argparse import json +import re import sys from pathlib import Path +# LIBERO-plus derives task.language by space-joining the perturbation-variant +# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py), +# so non-_language_ variants inherit a trailing metadata blob like +# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so +# the description matches the base instruction used in the training dataset. 
+_LIBERO_PERTURBATION_TAIL_RE = re.compile( + r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$" +) + + +def _strip_libero_perturbation_tail(instruction: str) -> str: + return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip() + def _libero_descriptions(task_suite: str) -> dict[str, str]: from libero.libero import benchmark # type: ignore[import-untyped] @@ -47,7 +61,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]: ) return {} suite = suite_dict[task_suite]() - return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)} + return { + f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language) + for i in range(suite.n_tasks) + } def _metaworld_descriptions(task_name: str) -> dict[str, str]: @@ -57,19 +74,120 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]: return {f"{task_name}_0": label} +def _robotwin_descriptions(task_names: str) -> dict[str, str]: + """Return descriptions for each requested RoboTwin task. Reads + `description/task_instruction/.json` from the RoboTwin clone + (cwd is /opt/robotwin in CI). Falls back to the task name if missing.""" + out: dict[str, str] = {} + root = Path("description/task_instruction") + for name in (t.strip() for t in task_names.split(",") if t.strip()): + desc_file = root / f"{name}.json" + desc = name.replace("_", " ") + if desc_file.is_file(): + data = json.loads(desc_file.read_text()) + full = data.get("full_description") or desc + # Strip the schema placeholders ({A}, {a}) — keep the sentence readable. + desc = full.replace("<", "").replace(">", "") + out[f"{name}_0"] = desc + return out + + +def _robocasa_descriptions(task_spec: str) -> dict[str, str]: + """For each task in the comma-separated list, emit a cleaned-name label. + + RoboCasa episodes carry their language instruction in the env's + `ep_meta['lang']`, populated per reset. 
Pulling it requires spinning + up the full kitchen env per task (~seconds each); we use the task + name as the key here and let the eval's episode info carry the + actual instruction. + """ + out: dict[str, str] = {} + for task in (t.strip() for t in task_spec.split(",") if t.strip()): + # Split CamelCase into words: "CloseFridge" → "close fridge". + label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip() + out[f"{task}_0"] = label or task + return out + + +_ROBOMME_DESCRIPTIONS = { + "BinFill": "Fill the target bin with the correct number of cubes", + "PickXtimes": "Pick the indicated cube the specified number of times", + "SwingXtimes": "Swing the object the specified number of times", + "StopCube": "Grasp and stop the moving cube", + "VideoUnmask": "Pick the cube shown in the reference video", + "VideoUnmaskSwap": "Pick the cube matching the reference video after a swap", + "ButtonUnmask": "Press the button indicated by the reference", + "ButtonUnmaskSwap": "Press the correct button after objects are swapped", + "PickHighlight": "Pick the highlighted cube", + "VideoRepick": "Repick the cube shown in the reference video", + "VideoPlaceButton": "Place the cube on the button shown in the video", + "VideoPlaceOrder": "Place cubes in the order shown in the video", + "MoveCube": "Move the cube to the target location", + "InsertPeg": "Insert the peg into the target hole", + "PatternLock": "Unlock the pattern by pressing buttons in sequence", + "RouteStick": "Route the stick through the required waypoints", +} + + +def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]: + """Return descriptions for each requested RoboMME task. 
Keys match the + video filename pattern `_` used by the eval script.""" + if task_ids is None: + task_ids = [0] + out: dict[str, str] = {} + for name in (t.strip() for t in task_names.split(",") if t.strip()): + desc = _ROBOMME_DESCRIPTIONS.get(name, name) + for tid in task_ids: + out[f"{name}_{tid}"] = desc + return out + + +def _vlabench_descriptions(task_spec: str) -> dict[str, str]: + """For each task in the comma-separated list, emit a cleaned-name label. + + VLABench tasks carry language instructions on their dm_control task + object, but pulling them requires loading the full env per task + (~seconds each). The CI smoke-eval already captures the instruction + inside its episode info; this mapping is just enough to key + `metrics.json` by `_0`. + """ + out: dict[str, str] = {} + for task in (t.strip() for t in task_spec.split(",") if t.strip()): + out[f"{task}_0"] = task.replace("_", " ").strip() + return out + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)") parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)") + parser.add_argument( + "--task-ids", + type=str, + default=None, + help="Comma-separated task IDs (e.g. '0,1,2'). 
Default: [0]",
+    )
     parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
     args = parser.parse_args()
 
+    task_ids: list[int] | None = None
+    if args.task_ids:
+        task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
+
     descriptions: dict[str, str] = {}
     try:
-        if args.env == "libero":
+        if args.env in ("libero", "libero_plus"):
             descriptions = _libero_descriptions(args.task)
         elif args.env == "metaworld":
             descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robotwin":
+            descriptions = _robotwin_descriptions(args.task)
+        elif args.env == "robocasa":
+            descriptions = _robocasa_descriptions(args.task)
+        elif args.env == "robomme":
+            descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
+        elif args.env == "vlabench":
+            descriptions = _vlabench_descriptions(args.task)
         else:
             print(
                 f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 2a7c52d45..84c40472f 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig):
     camera_name_mapping: dict[str, str] | None = None
     observation_height: int = 360
     observation_width: int = 360
+    is_libero_plus: bool = False
     features: dict[str, PolicyFeature] = field(
         default_factory=lambda: {
             ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
@@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig):
             control_mode=self.control_mode,
             episode_length=self.episode_length,
             camera_name_mapping=self.camera_name_mapping,
+            is_libero_plus=self.is_libero_plus,
         )
 
     def get_env_processors(self):
@@ -496,6 +498,146 @@ class MetaworldEnv(EnvConfig):
         )
 
 
+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+    task: str = "CloseFridge"
+    fps: int = 20
+    episode_length: int = 1000
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    camera_name: str =
"robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right" + observation_height: int = 256 + observation_width: int = 256 + visualization_height: int = 512 + visualization_width: int = 512 + split: str | None = None + # Object-mesh registries to sample from. Upstream default is + # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image + # only ships the lightwheel pack. Override to include objaverse once + # you've run `python -m robocasa.scripts.download_kitchen_assets + # --type objaverse` locally. + obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"]) + features: dict[str, PolicyFeature] = field( + default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))} + ) + features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE}) + + def __post_init__(self): + if self.obs_type not in ("pixels", "pixels_agent_pos"): + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + # Preserve raw RoboCasa camera names end-to-end (e.g. + # `observation.images.robot0_agentview_left`). This matches the + # naming convention used by the RoboCasa datasets on the Hub, so + # trained policies don't need a `--rename_map` at eval time. 
+ cams = [c.strip() for c in self.camera_name.split(",") if c.strip()] + for cam in cams: + self.features[f"pixels/{cam}"] = PolicyFeature( + type=FeatureType.VISUAL, + shape=(self.observation_height, self.observation_width, 3), + ) + self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}" + + if self.obs_type == "pixels_agent_pos": + self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,)) + + @property + def gym_kwargs(self) -> dict: + kwargs: dict[str, Any] = { + "obs_type": self.obs_type, + "render_mode": self.render_mode, + "observation_height": self.observation_height, + "observation_width": self.observation_width, + "visualization_height": self.visualization_height, + "visualization_width": self.visualization_width, + } + if self.split is not None: + kwargs["split"] = self.split + return kwargs + + def create_envs(self, n_envs: int, use_async_envs: bool = False): + from .robocasa import create_robocasa_envs + + if self.task is None: + raise ValueError("RoboCasaEnv requires a task to be specified") + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robocasa_envs( + task=self.task, + n_envs=n_envs, + camera_name=self.camera_name, + gym_kwargs=self.gym_kwargs, + env_cls=env_cls, + episode_length=self.episode_length, + obj_registries=tuple(self.obj_registries), + ) + + +@EnvConfig.register_subclass("vlabench") +@dataclass +class VLABenchEnv(EnvConfig): + task: str = "select_fruit" + fps: int = 10 + episode_length: int = 500 + obs_type: str = "pixels_agent_pos" + render_mode: str = "rgb_array" + render_resolution: tuple[int, int] = (480, 480) + robot: str = "franka" + action_mode: str = "eef" + features: dict[str, PolicyFeature] = field( + default_factory=lambda: { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)), + } + ) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "agent_pos": OBS_STATE, + "pixels/image": f"{OBS_IMAGES}.image", + "pixels/second_image": 
f"{OBS_IMAGES}.second_image", + "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image", + } + ) + + def __post_init__(self): + h, w = self.render_resolution + if self.obs_type == "pixels": + self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + elif self.obs_type == "pixels_agent_pos": + self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3)) + self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,)) + else: + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + @property + def gym_kwargs(self) -> dict: + return { + "obs_type": self.obs_type, + "render_mode": self.render_mode, + "render_resolution": self.render_resolution, + "robot": self.robot, + "max_episode_steps": self.episode_length, + "action_mode": self.action_mode, + } + + def create_envs(self, n_envs: int, use_async_envs: bool = False): + from .vlabench import create_vlabench_envs + + if self.task is None: + raise ValueError("VLABenchEnv requires a task to be specified") + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_vlabench_envs( + task=self.task, + n_envs=n_envs, + gym_kwargs=self.gym_kwargs, + env_cls=env_cls, + ) + + @EnvConfig.register_subclass("isaaclab_arena") @dataclass class IsaaclabArenaEnv(HubEnvConfig): @@ -574,3 +716,171 @@ class IsaaclabArenaEnv(HubEnvConfig): ), PolicyProcessorPipeline(steps=[]), ) + + +@EnvConfig.register_subclass("libero_plus") +@dataclass +class LiberoPlusEnv(LiberoEnv): + """Config for LIBERO-plus robustness benchmark evaluation. 
+ + LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints, + object layouts, robot initial states, language instructions, lighting, background + textures, sensor noise) producing ~10k task variants. + + The gym interface is identical to LIBERO so this class reuses ``LiberoEnv`` + entirely — only the registered name and default task suite differ. + + Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships + as a namespace package from a git fork and must be cloned + PYTHONPATH'd + rather than installed as a pyproject extra. + + See Also: + https://github.com/sylvestf/LIBERO-plus + """ + + task: str = "libero_spatial" + is_libero_plus: bool = True + + +@EnvConfig.register_subclass("robotwin") +@dataclass +class RoboTwinEnvConfig(EnvConfig): + """Configuration for RoboTwin 2.0 benchmark environments. + + RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the + SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF + (7 per arm). All three cameras are enabled by default. + + See: https://robotwin-platform.github.io + Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified + """ + + task: str = "beat_block_hammer" # single task or comma-separated list + fps: int = 25 + episode_length: int = 300 + obs_type: str = "pixels_agent_pos" + render_mode: str = "rgb_array" + # Available cameras from RoboTwin's aloha-agilex embodiment: head_camera + # (torso-mounted) + left_camera / right_camera (wrists). + camera_names: str = "head_camera,left_camera,right_camera" + # Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml). + # Gym's vector-env concatenate pre-allocates buffers of this shape, so it + # must equal what SAPIEN actually renders. 
+ observation_height: int = 240 + observation_width: int = 320 + features: dict[str, PolicyFeature] = field( + default_factory=lambda: { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)), + } + ) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "pixels/head_camera": f"{OBS_IMAGES}.head_camera", + "pixels/left_camera": f"{OBS_IMAGES}.left_camera", + "pixels/right_camera": f"{OBS_IMAGES}.right_camera", + "agent_pos": OBS_STATE, + } + ) + + def __post_init__(self): + cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()] + for cam in cam_list: + self.features[f"pixels/{cam}"] = PolicyFeature( + type=FeatureType.VISUAL, + shape=(self.observation_height, self.observation_width, 3), + ) + # Keep features_map entry if already set (default_factory); add if missing. + key = f"pixels/{cam}" + if key not in self.features_map: + self.features_map[key] = f"{OBS_IMAGES}.{cam}" + + if self.obs_type == "pixels_agent_pos": + self.features["agent_pos"] = PolicyFeature( + type=FeatureType.STATE, + shape=(14,), # 14 DOF: 7 per arm + ) + elif self.obs_type != "pixels": + raise ValueError( + f"Unsupported obs_type '{self.obs_type}'. " + "RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'." 
+ ) + + @property + def gym_kwargs(self) -> dict: + return {} + + def create_envs(self, n_envs: int, use_async_envs: bool = True): + from lerobot.envs.robotwin import create_robotwin_envs + + if not self.task: + raise ValueError("RoboTwinEnvConfig requires `task` to be specified.") + + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()] + return create_robotwin_envs( + task=self.task, + n_envs=n_envs, + env_cls=env_cls, + camera_names=cam_list, + observation_height=self.observation_height, + observation_width=self.observation_width, + episode_length=self.episode_length, + ) + + +@EnvConfig.register_subclass("robomme") +@dataclass +class RoboMMEEnv(EnvConfig): + """RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN). + + 16 tasks across 4 suites: Counting, Permanence, Reference, Imitation. + Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes). + Benchmark: https://github.com/RoboMME/robomme_benchmark + + Requires the `robomme` git package installed separately (Linux only); + see docker/Dockerfile.benchmark.robomme for the canonical install. 
+ """ + + task: str = "PickXtimes" + fps: int = 10 + episode_length: int = 300 + action_space: str = "joint_angle" # or "ee_pose" (7-D) + dataset_split: str = "test" # "train" | "val" | "test" + task_ids: list[int] | None = None + features: dict[str, PolicyFeature] = field(default_factory=dict) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "pixels/image": f"{OBS_IMAGES}.image", + "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image", + "agent_pos": OBS_STATE, + } + ) + + def __post_init__(self): + action_dim = 8 if self.action_space == "joint_angle" else 7 + self.features = { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)), + "pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + "pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + "agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)), + } + + @property + def gym_kwargs(self) -> dict: + return {} + + def create_envs(self, n_envs: int, use_async_envs: bool = True): + from lerobot.envs.robomme import create_robomme_envs + + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robomme_envs( + task=self.task, + n_envs=n_envs, + action_space_type=self.action_space, + dataset=self.dataset_split, + episode_length=self.episode_length, + task_ids=self.task_ids, + env_cls=env_cls, + ) diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index ec90d0ffd..12be9e196 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -16,6 +16,7 @@ from __future__ import annotations import os +import re from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial @@ -31,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv from lerobot.types import RobotObservation -from .utils import _LazyAsyncVectorEnv - - -def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: - 
"""Normalize camera_name into a non-empty list of strings.""" - if isinstance(camera_name, str): - cams = [c.strip() for c in camera_name.split(",") if c.strip()] - elif isinstance(camera_name, (list | tuple)): - cams = [str(c).strip() for c in camera_name if str(c).strip()] - else: - raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") - if not cams: - raise ValueError("camera_name resolved to an empty list.") - return cams +from .utils import _LazyAsyncVectorEnv, parse_camera_names def _get_suite(name: str) -> benchmark.Benchmark: @@ -69,14 +57,34 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i return ids -def get_task_init_states(task_suite: Any, i: int) -> np.ndarray: - init_states_path = ( - Path(get_libero_path("init_states")) - / task_suite.tasks[i].problem_folder - / task_suite.tasks[i].init_states_file - ) - init_states = torch.load(init_states_path, weights_only=False) # nosec B614 - return init_states +# LIBERO-plus perturbation variants encode the perturbation in the filename +# but on disk only the base `.pruned_init` exists — strip the suffix to match +# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we +# can pass weights_only=False for PyTorch 2.6+ numpy pickles). +_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+") + + +def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray: + task = task_suite.tasks[i] + filename = Path(task.init_states_file) + root = Path(get_libero_path("init_states")) + + if not is_libero_plus: + init_states_path = root / task.problem_folder / filename.name + return torch.load(init_states_path, weights_only=False) # nosec B614 + + # LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under + # libero_newobj/ as a flat array that must be reshaped to (1, -1). 
+ if "_add_" in filename.name or "_level" in filename.name: + init_states_path = root / "libero_newobj" / task.problem_folder / filename.name + init_states = torch.load(init_states_path, weights_only=False) # nosec B614 + return init_states.reshape(1, -1) + + # LIBERO-plus perturbation variants encode the perturbation in the filename + # but on disk only the base `.pruned_init` exists — strip the suffix to match. + stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix + init_states_path = root / task.problem_folder / stripped + return torch.load(init_states_path, weights_only=False) # nosec B614 def get_libero_dummy_action(): @@ -118,9 +126,11 @@ class LiberoEnv(gym.Env): camera_name_mapping: dict[str, str] | None = None, num_steps_wait: int = 10, control_mode: str = "relative", + is_libero_plus: bool = False, ): super().__init__() self.task_id = task_id + self.is_libero_plus = is_libero_plus self.obs_type = obs_type self.render_mode = render_mode self.observation_width = observation_width @@ -128,7 +138,7 @@ class LiberoEnv(gym.Env): self.visualization_width = visualization_width self.visualization_height = visualization_height self.init_states = init_states - self.camera_name = _parse_camera_names( + self.camera_name = parse_camera_names( camera_name ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) @@ -147,7 +157,11 @@ class LiberoEnv(gym.Env): self.episode_index = episode_index self.episode_length = episode_length # Load once and keep - self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None + self._init_states = ( + get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus) + if self.init_states + else None + ) self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`. 
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state @@ -380,6 +394,7 @@ def _make_env_fns( gym_kwargs: Mapping[str, Any], control_mode: str, camera_name_mapping: dict[str, str] | None = None, + is_libero_plus: bool = False, ) -> list[Callable[[], LiberoEnv]]: """Build n_envs factory callables for a single (suite, task_id).""" @@ -396,6 +411,7 @@ def _make_env_fns( n_envs=n_envs, control_mode=control_mode, camera_name_mapping=camera_name_mapping, + is_libero_plus=is_libero_plus, **local_kwargs, ) @@ -418,6 +434,7 @@ def create_libero_envs( control_mode: str = "relative", episode_length: int | None = None, camera_name_mapping: dict[str, str] | None = None, + is_libero_plus: bool = False, ) -> dict[str, dict[int, Any]]: """ Create vectorized LIBERO environments with a consistent return shape. @@ -437,7 +454,7 @@ def create_libero_envs( gym_kwargs = dict(gym_kwargs or {}) task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks - camera_names = _parse_camera_names(camera_name) + camera_names = parse_camera_names(camera_name) suite_names = [s.strip() for s in str(task).split(",") if s.strip()] if not suite_names: raise ValueError("`task` must contain at least one LIBERO suite name.") @@ -462,6 +479,7 @@ def create_libero_envs( # Probe once and reuse to avoid creating a temp env per task. 
cached_obs_space: spaces.Space | None = None cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None for tid in selected: fns = _make_env_fns( @@ -475,12 +493,14 @@ def create_libero_envs( gym_kwargs=gym_kwargs, control_mode=control_mode, camera_name_mapping=camera_name_mapping, + is_libero_plus=is_libero_plus, ) if is_async: - lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space) + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) if cached_obs_space is None: cached_obs_space = lazy.observation_space cached_act_space = lazy.action_space + cached_metadata = lazy.metadata out[suite_name][tid] = lazy else: out[suite_name][tid] = env_cls(fns) diff --git a/src/lerobot/envs/metaworld.py b/src/lerobot/envs/metaworld.py index 1dc513a68..bffcf6b6e 100644 --- a/src/lerobot/envs/metaworld.py +++ b/src/lerobot/envs/metaworld.py @@ -311,6 +311,7 @@ def create_metaworld_envs( is_async = env_cls is gym.vector.AsyncVectorEnv cached_obs_space = None cached_act_space = None + cached_metadata = None out: dict[str, dict[int, Any]] = defaultdict(dict) for group in task_groups: @@ -324,10 +325,11 @@ def create_metaworld_envs( fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)] if is_async: - lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space) + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) if cached_obs_space is None: cached_obs_space = lazy.observation_space cached_act_space = lazy.action_space + cached_metadata = lazy.metadata out[group][tid] = lazy else: out[group][tid] = env_cls(fns) diff --git a/src/lerobot/envs/robocasa.py b/src/lerobot/envs/robocasa.py new file mode 100644 index 000000000..a84a7c766 --- /dev/null +++ b/src/lerobot/envs/robocasa.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
+
+logger = logging.getLogger(__name__)
+
+# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
+# These correspond to the PandaOmron robot in RoboCasa365.
+OBS_STATE_DIM = 16  # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
+ACTION_DIM = 12  # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+
+# Default PandaOmron cameras. We surface these raw names directly as
+# `observation.images.<camera>` so the LeRobot dataset/policy keys match
+# RoboCasa's native convention (no implicit renaming).
+DEFAULT_CAMERAS = [
+    "robot0_agentview_left",
+    "robot0_eye_in_hand",
+    "robot0_agentview_right",
+]
+
+# Object-mesh registries to sample from. RoboCasa's upstream default is
+# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
+# most users — including our CI image — only download the lightwheel pack
+# (`--type objs_lw` in `download_kitchen_assets`).
When a sampled object +# category has zero candidates in every registry, robocasa crashes with +# `ValueError: Probabilities contain NaN` (0/0 divide in the probability +# normalization). Restricting to registries that are actually on disk +# avoids the NaN and matches what the asset download provides. +DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",) + +# Task-group shortcuts accepted as `--env.task`. When the user passes one of +# these names, we expand it to the upstream RoboCasa task list and auto-set +# the dataset split. Individual task names (optionally comma-separated) still +# take precedence; this only triggers on an exact group-name match. +_TASK_GROUP_SPLITS = { + "atomic_seen": "target", + "composite_seen": "target", + "composite_unseen": "target", + "pretrain50": "pretrain", + "pretrain100": "pretrain", + "pretrain200": "pretrain", + "pretrain300": "pretrain", +} + + +def _resolve_tasks(task: str) -> tuple[list[str], str | None]: + """Resolve a `--env.task` value to (task_names, split_override). + + If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`), + expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS` + and return the matching split. Otherwise treat `task` as a single task or + comma-separated list and leave the split untouched (None). + """ + key = task.strip() + if key in _TASK_GROUP_SPLITS: + from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS + + combined = {**TARGET_TASKS, **PRETRAINING_TASKS} + if key not in combined: + raise ValueError( + f"Task group '{key}' is not available in this version of robocasa. " + f"Known groups: {sorted(combined.keys())}." 
+            )
+        return list(combined[key]), _TASK_GROUP_SPLITS[key]
+
+    names = [t.strip() for t in task.split(",") if t.strip()]
+    if not names:
+        raise ValueError("`task` must contain at least one RoboCasa task name.")
+    return names, None
+
+
+def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
+    """Split a flat (12,) action vector into a RoboCasa action dict.
+
+    Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+    """
+    return {
+        "action.base_motion": flat_action[0:4],
+        "action.control_mode": flat_action[4:5],
+        "action.end_effector_position": flat_action[5:8],
+        "action.end_effector_rotation": flat_action[8:11],
+        "action.gripper_close": flat_action[11:12],
+    }
+
+
+class RoboCasaEnv(gym.Env):
+    """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
+
+    Wraps RoboCasaGymEnv from the robocasa package and converts its
+    dict-based observations and actions into the flat arrays LeRobot expects.
+    Raw RoboCasa camera names are preserved verbatim under `pixels/<camera>`.
+ """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 20} + + def __init__( + self, + task: str, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + obs_type: str = "pixels_agent_pos", + render_mode: str = "rgb_array", + observation_width: int = 256, + observation_height: int = 256, + visualization_width: int = 512, + visualization_height: int = 512, + split: str | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, + episode_index: int = 0, + ): + super().__init__() + self.task = task + self.obs_type = obs_type + self.render_mode = render_mode + self.observation_width = observation_width + self.observation_height = observation_height + self.visualization_width = visualization_width + self.visualization_height = visualization_height + self.split = split + self.obj_registries = tuple(obj_registries) + # Per-worker index (0..n_envs-1) used to spread the user-provided + # seed across factories so each sub-env explores a distinct layout + # even when the same seed is passed to `reset()`. + self.episode_index = int(episode_index) + + self.camera_name = parse_camera_names(camera_name) + + self._max_episode_steps = episode_length if episode_length is not None else 1000 + + # Deferred — created on first reset() inside the worker subprocess + # to avoid inheriting stale GPU/EGL contexts across fork(). 
+        self._env: Any = None
+        self.task_description = ""
+
+        images = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_name
+        }
+
+        if self.obs_type == "pixels":
+            self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-np.inf,
+                        high=np.inf,
+                        shape=(OBS_STATE_DIM,),
+                        dtype=np.float32,
+                    ),
+                }
+            )
+        else:
+            raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
+
+        self.action_space = spaces.Box(
+            low=ACTION_LOW,
+            high=ACTION_HIGH,
+            shape=(ACTION_DIM,),
+            dtype=np.float32,
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the underlying RoboCasaGymEnv on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+        """
+        if self._env is not None:
+            return
+        from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
+
+        # RoboCasaGymEnv defaults split="test", which create_env rejects
+        # (only None/"all"/"pretrain"/"target" are valid). Always pass a
+        # valid value so we don't hit that default. Extra kwargs are
+        # forwarded to the underlying kitchen env via create_env/robosuite.make.
+        self._env = RoboCasaGymEnv(
+            env_name=self.task,
+            camera_widths=self.observation_width,
+            camera_heights=self.observation_height,
+            split=self.split if self.split is not None else "all",
+            obj_registries=self.obj_registries,
+        )
+
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+    def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
+        """Convert RoboCasaGymEnv observation dict to LeRobot format."""
+        # RoboCasaGymEnv emits camera frames under "video.<camera>".
+ images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs} + + if self.obs_type == "pixels": + return {"pixels": images} + + # `state.*` keys come from PandaOmronKeyConverter inside the wrapper. + agent_pos = np.concatenate( + [ + raw_obs.get("state.base_position", np.zeros(3)), + raw_obs.get("state.base_rotation", np.zeros(4)), + raw_obs.get("state.end_effector_position_relative", np.zeros(3)), + raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)), + raw_obs.get("state.gripper_qpos", np.zeros(2)), + ], + axis=-1, + ).astype(np.float32) + + return {"pixels": images, "agent_pos": agent_pos} + + def render(self) -> np.ndarray: + self._ensure_env() + assert self._env is not None + return self._env.render() + + def reset(self, seed=None, **kwargs): + self._ensure_env() + assert self._env is not None + super().reset(seed=seed) + # Spread the seed across workers so n_envs factories don't all + # roll the same scene. With an explicit user seed we shift it by + # episode_index; with no seed we fall back to episode_index so + # each worker is still distinct rather than inheriting the same + # global RNG state. 
+ worker_seed = seed + self.episode_index if seed is not None else self.episode_index + raw_obs, info = self._env.reset(seed=worker_seed) + + ep_meta = self._env.env.get_ep_meta() + self.task_description = ep_meta.get("lang", self.task) + + observation = self._format_raw_obs(raw_obs) + info = {"is_success": False} + return observation, info + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + self._ensure_env() + assert self._env is not None + if action.ndim != 1: + raise ValueError( + f"Expected action to be 1-D (shape (action_dim,)), " + f"but got shape {action.shape} with ndim={action.ndim}" + ) + + action_dict = convert_action(action) + raw_obs, reward, done, truncated, info = self._env.step(action_dict) + + is_success = bool(info.get("success", False)) + terminated = done or is_success + info.update({"task": self.task, "done": done, "is_success": is_success}) + + observation = self._format_raw_obs(raw_obs) + if terminated: + info["final_info"] = { + "task": self.task, + "done": bool(done), + "is_success": bool(is_success), + } + self.reset() + + return observation, reward, terminated, truncated, info + + def close(self): + if self._env is not None: + self._env.close() + + +def _make_env_fns( + *, + task: str, + n_envs: int, + camera_names: list[str], + obs_type: str, + render_mode: str, + observation_width: int, + observation_height: int, + visualization_width: int, + visualization_height: int, + split: str | None, + episode_length: int | None, + obj_registries: Sequence[str], +) -> list[Callable[[], RoboCasaEnv]]: + """Build n_envs factory callables for a single task. + + Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so + ``RoboCasaEnv.reset()`` can derive a per-worker seed series from the + user-provided seed. 
+ """ + + def _make_env(episode_index: int) -> RoboCasaEnv: + return RoboCasaEnv( + task=task, + camera_name=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + episode_index=episode_index, + ) + + return [partial(_make_env, i) for i in range(n_envs)] + + +def create_robocasa_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] | None = None, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, +) -> dict[str, dict[int, Any]]: + """Create vectorized RoboCasa365 environments with a consistent return shape. + + Returns: + dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories) + + `task` can be: + - a single task name (e.g. `CloseFridge`) + - a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`) + - a benchmark-group shortcut (`atomic_seen`, `composite_seen`, + `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, + `pretrain300`), which auto-expands to the upstream task list and + auto-sets the dataset `split` ("target" or "pretrain"). 
+ """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + gym_kwargs = dict(gym_kwargs or {}) + obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos") + render_mode = gym_kwargs.pop("render_mode", "rgb_array") + observation_width = gym_kwargs.pop("observation_width", 256) + observation_height = gym_kwargs.pop("observation_height", 256) + visualization_width = gym_kwargs.pop("visualization_width", 512) + visualization_height = gym_kwargs.pop("visualization_height", 512) + split = gym_kwargs.pop("split", None) + + camera_names = parse_camera_names(camera_name) + task_names, group_split = _resolve_tasks(str(task)) + if group_split is not None and split is None: + split = group_split + + logger.info( + "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d", + task_names, + split, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + out: dict[str, dict[int, Any]] = defaultdict(dict) + + for task_name in task_names: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + camera_names=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + ) + + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[task_name][0] = lazy + else: + 
out[task_name][0] = env_cls(fns) + logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs) + + return {name: dict(task_map) for name, task_map in out.items()} diff --git a/src/lerobot/envs/robomme.py b/src/lerobot/envs/robomme.py new file mode 100644 index 000000000..69d665bd4 --- /dev/null +++ b/src/lerobot/envs/robomme.py @@ -0,0 +1,245 @@ +"""RoboMME environment wrapper for LeRobot evaluation. + +Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible +``VectorEnv`` suitable for ``lerobot_eval``. + +RoboMME tasks: + Counting: BinFill, PickXtimes, SwingXtimes, StopCube + Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap + Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder + Imitation: MoveCube, InsertPeg, PatternLock, RouteStick + +Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes) +Install: see docker/Dockerfile.benchmark.robomme (Linux only — mani-skill vs numpy pin conflict) +Benchmark: https://github.com/RoboMME/robomme_benchmark +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +from gymnasium import spaces + +from .utils import _LazyAsyncVectorEnv + +ROBOMME_TASKS = [ + "BinFill", + "PickXtimes", + "SwingXtimes", + "StopCube", + "VideoUnmask", + "VideoUnmaskSwap", + "ButtonUnmask", + "ButtonUnmaskSwap", + "PickHighlight", + "VideoRepick", + "VideoPlaceButton", + "VideoPlaceOrder", + "MoveCube", + "InsertPeg", + "PatternLock", + "RouteStick", +] + + +class RoboMMEGymEnv(gym.Env): + """Thin Gymnasium wrapper around a single RoboMME episode env.""" + + metadata = {"render_modes": ["rgb_array"], "render_fps": 10} + + def __init__( + self, + task: str = "PickXtimes", + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_idx: int = 0, + max_steps: int = 300, + ): + super().__init__() + from robomme.env_record_wrapper 
import BenchmarkEnvBuilder + + self._task = task + self._action_space_type = action_space_type + self._dataset = dataset + self._episode_idx = episode_idx + self._max_steps = max_steps + self._max_episode_steps = max_steps + + self._builder = BenchmarkEnvBuilder( + env_id=task, + dataset=dataset, + action_space=action_space_type, + gui_render=False, + max_steps=max_steps, + ) + self._env = None + self._last_raw_obs: dict | None = None + + action_dim = 8 if action_space_type == "joint_angle" else 7 + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32) + # `pixels` must be a nested Dict so `preprocess_observation()` in + # envs/utils.py picks it up and maps each camera to + # `observation.images.`. A flat layout (`pixels/image`, + # `pixels/wrist_image`) silently drops every image from the batch. + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + } + ), + "agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32), + } + ) + + def reset(self, *, seed=None, options=None): + super().reset(seed=seed) + self._env = self._builder.make_env_for_episode( + episode_idx=self._episode_idx, + max_steps=self._max_steps, + ) + obs, info = self._env.reset() + self._last_raw_obs = obs + return self._convert_obs(obs), self._convert_info(info) + + def step(self, action): + obs, reward, terminated, truncated, info = self._env.step(action) + self._last_raw_obs = obs + + terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated) + truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated) + + status = info.get("status", "ongoing") + is_success = status == "success" + conv_info = self._convert_info(info) + conv_info["is_success"] = is_success + + return self._convert_obs(obs), float(reward), terminated_bool, 
truncated_bool, conv_info + + def render(self) -> np.ndarray | None: + """Return the front camera image from the last observation for video recording.""" + if self._last_raw_obs is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + front = self._last_raw_obs.get("front_rgb_list") + if front is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + frame = front[-1] if isinstance(front, list) else front + return np.asarray(frame, dtype=np.uint8) + + def _convert_obs(self, obs: dict) -> dict: + front_rgb = ( + obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"] + ) + wrist_rgb = ( + obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"] + ) + joint_state = ( + obs["joint_state_list"][-1] + if isinstance(obs["joint_state_list"], list) + else obs["joint_state_list"] + ) + gripper_state = ( + obs["gripper_state_list"][-1] + if isinstance(obs["gripper_state_list"], list) + else obs["gripper_state_list"] + ) + + front_rgb = np.asarray(front_rgb, dtype=np.uint8) + wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8) + joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7] + gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1] + state = np.concatenate([joint, gripper]) + + return { + "pixels": {"image": front_rgb, "wrist_image": wrist_rgb}, + "agent_pos": state, + } + + def _convert_info(self, info: dict) -> dict: + return { + "status": info.get("status", "ongoing"), + "task_goal": info.get("task_goal", ""), + } + + +def _make_env_fns( + *, + task: str, + n_envs: int, + action_space_type: str, + dataset: str, + episode_length: int, + task_id: int, +) -> list[Callable[[], RoboMMEGymEnv]]: + """Build n_envs factory callables for one RoboMME task id.""" + + def _make_one(episode_index: int) -> RoboMMEGymEnv: + return RoboMMEGymEnv( + task=task, + action_space_type=action_space_type, + dataset=dataset, + episode_idx=episode_index, + max_steps=episode_length, + ) + + 
return [partial(_make_one, task_id + i) for i in range(n_envs)] + + +def create_robomme_envs( + task: str, + n_envs: int = 1, + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_length: int = 300, + task_ids: list[int] | None = None, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, +) -> dict[str, dict[int, gym.vector.VectorEnv]]: + """Create vectorized RoboMME environments for evaluation. + + `task` may be a single RoboMME task name (e.g. "PickXtimes") or a + comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task + becomes its own suite in the returned mapping. + + Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format. + """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of env factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + if task_ids is None: + task_ids = [0] + + task_names = [t.strip() for t in task.split(",") if t.strip()] + is_async = env_cls is gym.vector.AsyncVectorEnv + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + out: dict[str, dict[int, gym.vector.VectorEnv]] = {} + for task_name in task_names: + envs_by_task: dict[int, gym.vector.VectorEnv] = {} + for task_id in task_ids: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + action_space_type=action_space_type, + dataset=dataset, + episode_length=episode_length, + task_id=task_id, + ) + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + envs_by_task[task_id] = lazy + else: + envs_by_task[task_id] = env_cls(fns) + out[task_name] = envs_by_task + return out diff --git 
a/src/lerobot/envs/robotwin.py b/src/lerobot/envs/robotwin.py new file mode 100644 index 000000000..823f14fa0 --- /dev/null +++ b/src/lerobot/envs/robotwin.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import importlib +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +import torch +from gymnasium import spaces + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv + +logger = logging.getLogger(__name__) + +# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking +# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb"). +ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = ( + "head_camera", + "left_camera", + "right_camera", +) + +ACTION_DIM = 14 # 7 DOF × 2 arms +ACTION_LOW = -1.0 +ACTION_HIGH = 1.0 +DEFAULT_EPISODE_LENGTH = 300 +# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects). +DEFAULT_CAMERA_H = 240 +DEFAULT_CAMERA_W = 320 + +# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly +# (50 tasks as of main; earlier revisions had 60 with a different split). 
+# Keep this in sync with: +# gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \ +# | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//' +ROBOTWIN_TASKS: tuple[str, ...] = ( + "adjust_bottle", + "beat_block_hammer", + "blocks_ranking_rgb", + "blocks_ranking_size", + "click_alarmclock", + "click_bell", + "dump_bin_bigbin", + "grab_roller", + "handover_block", + "handover_mic", + "hanging_mug", + "lift_pot", + "move_can_pot", + "move_pillbottle_pad", + "move_playingcard_away", + "move_stapler_pad", + "open_laptop", + "open_microwave", + "pick_diverse_bottles", + "pick_dual_bottles", + "place_a2b_left", + "place_a2b_right", + "place_bread_basket", + "place_bread_skillet", + "place_burger_fries", + "place_can_basket", + "place_cans_plasticbox", + "place_container_plate", + "place_dual_shoes", + "place_empty_cup", + "place_fan", + "place_mouse_pad", + "place_object_basket", + "place_object_scale", + "place_object_stand", + "place_phone_stand", + "place_shoe", + "press_stapler", + "put_bottles_dustbin", + "put_object_cabinet", + "rotate_qrcode", + "scan_object", + "shake_bottle", + "shake_bottle_horizontally", + "stack_blocks_three", + "stack_blocks_two", + "stack_bowls_three", + "stack_bowls_two", + "stamp_seal", + "turn_switch", +) + + +_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {} + + +def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]: + """Build the kwargs dict RoboTwin's setup_demo expects. + + Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``: + reads ``task_config/demo_clean.yml``, resolves the embodiment file from + ``_embodiment_config.yml``, loads the robot's own ``config.yml``, and + reads camera dimensions from ``_camera_config.yml``. + + Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment + used by beat_block_hammer and most smoke-test tasks). 
+ """ + if task_name in _ROBOTWIN_SETUP_CACHE: + return dict(_ROBOTWIN_SETUP_CACHE[task_name]) + + import os + + import yaml # type: ignore[import-untyped] + from envs import CONFIGS_PATH # type: ignore[import-not-found] + + task_config = "demo_clean" + with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f: + args = yaml.safe_load(f) + + # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot) + with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f: + embodiment_types = yaml.safe_load(f) + embodiment = args.get("embodiment", ["aloha-agilex"]) + if len(embodiment) == 1: + robot_file = embodiment_types[embodiment[0]]["file_path"] + args["left_robot_file"] = robot_file + args["right_robot_file"] = robot_file + args["dual_arm_embodied"] = True + elif len(embodiment) == 3: + args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"] + args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"] + args["embodiment_dis"] = embodiment[2] + args["dual_arm_embodied"] = False + else: + raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}") + + with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f: + args["left_embodiment_config"] = yaml.safe_load(f) + with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f: + args["right_embodiment_config"] = yaml.safe_load(f) + + # Camera dimensions + with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f: + camera_config = yaml.safe_load(f) + head_cam = args["camera"]["head_camera_type"] + args["head_camera_h"] = camera_config[head_cam]["h"] + args["head_camera_w"] = camera_config[head_cam]["w"] + + # Headless overrides + args["render_freq"] = 0 + args["task_name"] = task_name + args["task_config"] = task_config + + _ROBOTWIN_SETUP_CACHE[task_name] = args + return dict(args) + + +def _load_robotwin_task(task_name: str) 
-> type: + """Dynamically import and return a RoboTwin 2.0 task class. + + RoboTwin tasks live in ``envs/.py`` relative to the repository + root and are expected to be on ``sys.path`` after installation. + """ + try: + module = importlib.import_module(f"envs.{task_name}") + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"Could not import RoboTwin task '{task_name}'. " + "Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. " + "See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html" + ) from e + task_cls = getattr(module, task_name, None) + if task_cls is None: + raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py") + return task_cls + + +class RoboTwinEnv(gym.Env): + """Gymnasium wrapper around a single RoboTwin 2.0 task. + + RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` / + ``take_action`` / ``check_success``) rather than the standard gym interface. + This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive + RoboTwin exactly like LIBERO or Meta-World. + + The underlying SAPIEN environment is created lazily on the first ``reset()`` + call *inside the worker process*. This is required for + ``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU + contexts that must not be forked from the parent process. + + Observations + ------------ + The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g. + ``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in + ``envs/utils.py`` then converts these to ``observation.images.``. + + Actions + ------- + 14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm). + + Autograd + -------- + ``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory + optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps + the rollout in ``torch.no_grad()``, so both call sites re-enable grad. 
+ """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 25} + + def __init__( + self, + task_name: str, + episode_index: int = 0, + n_envs: int = 1, + camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES, + observation_height: int | None = None, + observation_width: int | None = None, + episode_length: int = DEFAULT_EPISODE_LENGTH, + render_mode: str = "rgb_array", + ): + super().__init__() + self.task_name = task_name + self.task = task_name # used by add_envs_task() in utils.py + self.task_description = task_name.replace("_", " ") + self.episode_index = episode_index + self._reset_stride = n_envs + self.camera_names = list(camera_names) + # Default to D435 dims (the camera type baked into task_config/demo_clean.yml). + # The YAML-driven lookup is deferred to reset() so construction doesn't + # import RoboTwin's `envs` module — fast-tests run without RoboTwin installed. + self.observation_height = observation_height or DEFAULT_CAMERA_H + self.observation_width = observation_width or DEFAULT_CAMERA_W + self.episode_length = episode_length + self._max_episode_steps = episode_length # lerobot_eval.rollout reads this + self.render_mode = render_mode + + self._env: Any | None = None # deferred — created on first reset() inside worker + self._step_count: int = 0 + self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8) + + image_spaces = { + cam: spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + for cam in self.camera_names + } + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(image_spaces), + "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32), + } + ) + self.action_space = spaces.Box( + low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32 + ) + + def _ensure_env(self) -> None: + """Create the SAPIEN environment on first use. 
+ + Called inside the worker subprocess after fork(), so each worker gets + its own EGL/GPU context rather than inheriting a stale one from the + parent process (which causes crashes with AsyncVectorEnv). + """ + if self._env is not None: + return + task_cls = _load_robotwin_task(self.task_name) + self._env = task_cls() + + def _get_obs(self) -> RobotObservation: + assert self._env is not None, "_get_obs called before _ensure_env()" + raw = self._env.get_obs() + cameras_raw = raw.get("observation", {}) + + images: dict[str, np.ndarray] = {} + for cam in self.camera_names: + cam_data = cameras_raw.get(cam) + img = cam_data.get("rgb") if cam_data else None + if img is None: + images[cam] = self._black_frame + continue + img = np.asarray(img, dtype=np.uint8) + if img.ndim == 2: + img = np.stack([img, img, img], axis=-1) + elif img.shape[-1] != 3: + img = img[..., :3] + images[cam] = img + + ja = raw.get("joint_action") or {} + vec = ja.get("vector") + if vec is not None: + arr = np.asarray(vec, dtype=np.float32).ravel() + joint_state = ( + arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32) + ) + else: + joint_state = np.zeros(ACTION_DIM, dtype=np.float32) + + return {"pixels": images, "agent_pos": joint_state} + + def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]: + self._ensure_env() + super().reset(seed=seed) + assert self._env is not None # set by _ensure_env() above + + actual_seed = self.episode_index if seed is None else seed + setup_kwargs = _load_robotwin_setup_kwargs(self.task_name) + setup_kwargs.update(seed=actual_seed, is_test=True) + with torch.enable_grad(): + self._env.setup_demo(**setup_kwargs) + self.episode_index += self._reset_stride + self._step_count = 0 + + obs = self._get_obs() + return obs, {"is_success": False, "task": self.task_name} + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + assert self._env is not None, "step() 
called before reset()" + if action.ndim != 1 or action.shape[0] != ACTION_DIM: + raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}") + + with torch.enable_grad(): + if hasattr(self._env, "take_action"): + self._env.take_action(action) + else: + self._env.step(action) + + self._step_count += 1 + + is_success = bool(getattr(self._env, "eval_success", False)) + if not is_success and hasattr(self._env, "check_success"): + is_success = bool(self._env.check_success()) + + obs = self._get_obs() + reward = float(is_success) + terminated = is_success + truncated = self._step_count >= self.episode_length + + info: dict[str, Any] = { + "task": self.task_name, + "is_success": is_success, + "step": self._step_count, + } + if terminated or truncated: + info["final_info"] = { + "task": self.task_name, + "is_success": is_success, + } + self.reset() + + return obs, reward, terminated, truncated, info + + def render(self) -> np.ndarray: + self._ensure_env() + obs = self._get_obs() + # Prefer head camera for rendering; fall back to first available. 
+ if "head_camera" in obs["pixels"]: + return obs["pixels"]["head_camera"] + return next(iter(obs["pixels"].values())) + + def close(self) -> None: + if self._env is not None: + if hasattr(self._env, "close_env"): + import contextlib + + with contextlib.suppress(TypeError): + self._env.close_env() + self._env = None + + +# ---- Multi-task factory -------------------------------------------------------- + + +def _make_env_fns( + *, + task_name: str, + n_envs: int, + camera_names: list[str], + observation_height: int, + observation_width: int, + episode_length: int, +) -> list[Callable[[], RoboTwinEnv]]: + """Return n_envs factory callables for a single task.""" + + def _make_one(episode_index: int) -> RoboTwinEnv: + return RoboTwinEnv( + task_name=task_name, + episode_index=episode_index, + n_envs=n_envs, + camera_names=camera_names, + observation_height=observation_height, + observation_width=observation_width, + episode_length=episode_length, + ) + + return [partial(_make_one, i) for i in range(n_envs)] + + +def create_robotwin_envs( + task: str, + n_envs: int, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES, + observation_height: int = DEFAULT_CAMERA_H, + observation_width: int = DEFAULT_CAMERA_W, + episode_length: int = DEFAULT_EPISODE_LENGTH, +) -> dict[str, dict[int, Any]]: + """Create vectorized RoboTwin 2.0 environments. + + Returns: + ``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping + ``n_envs`` parallel rollouts. + + Args: + task: Comma-separated list of task names (e.g. ``"beat_block_hammer"`` + or ``"beat_block_hammer,click_bell"``). + n_envs: Number of parallel rollouts per task. + env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``). + camera_names: Cameras to include in observations. + observation_height: Pixel height for all cameras. + observation_width: Pixel width for all cameras. + episode_length: Max steps before truncation. 
+ """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + task_names = [t.strip() for t in str(task).split(",") if t.strip()] + if not task_names: + raise ValueError("`task` must contain at least one RoboTwin task name.") + + unknown = [t for t in task_names if t not in ROBOTWIN_TASKS] + if unknown: + raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}") + + logger.info( + "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d", + task_names, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + + out: dict[str, dict[int, Any]] = defaultdict(dict) + for task_name in task_names: + fns = _make_env_fns( + task_name=task_name, + n_envs=n_envs, + camera_names=list(camera_names), + observation_height=observation_height, + observation_width=observation_width, + episode_length=episode_length, + ) + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[task_name][0] = lazy + else: + out[task_name][0] = env_cls(fns) + logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs) + + return {k: dict(v) for k, v in out.items()} diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index b0d834a05..6e6f352e9 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape from .configs import EnvConfig +def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: + 
"""Normalize ``camera_name`` into a non-empty list of strings. + + Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of + strings (tuples/lists). Whitespace is stripped; empty entries are + dropped. Raises ``TypeError`` for unsupported input types and + ``ValueError`` when the normalized list is empty. + """ + if isinstance(camera_name, str): + cams = [c.strip() for c in camera_name.split(",") if c.strip()] + elif isinstance(camera_name, (list | tuple)): + cams = [str(c).strip() for c in camera_name if str(c).strip()] + else: + raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") + if not cams: + raise ValueError("camera_name resolved to an empty list.") + return cams + + def _convert_nested_dict(d): result = {} for k, v in d.items(): @@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv: env_fns: list[Callable], observation_space=None, action_space=None, + metadata=None, ): self._env_fns = env_fns self._env: gym.vector.AsyncVectorEnv | None = None self.num_envs = len(env_fns) - if observation_space is not None and action_space is not None: + if observation_space is not None and action_space is not None and metadata is not None: self.observation_space = observation_space self.action_space = action_space + self.metadata = metadata else: tmp = env_fns[0]() self.observation_space = tmp.observation_space self.action_space = tmp.action_space + self.metadata = tmp.metadata tmp.close() self.single_observation_space = self.observation_space self.single_action_space = self.action_space @@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv: if self._env is None: self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True) + @property + def unwrapped(self): + return self + def reset(self, **kwargs): self._ensure() return self._env.reset(**kwargs) diff --git a/src/lerobot/envs/vlabench.py b/src/lerobot/envs/vlabench.py new file mode 100644 index 000000000..922973a16 --- /dev/null +++ 
b/src/lerobot/envs/vlabench.py @@ -0,0 +1,589 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VLABench environment wrapper for LeRobot. + +VLABench is a large-scale benchmark for language-conditioned robotic manipulation +with long-horizon reasoning, built on MuJoCo/dm_control. + +- Paper: https://arxiv.org/abs/2412.18194 +- GitHub: https://github.com/OpenMOSS/VLABench +- Website: https://vlabench.github.io +""" + +from __future__ import annotations + +import contextlib +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from typing import Any + +import cv2 +import gymnasium as gym +import numpy as np +from gymnasium import spaces +from scipy.spatial.transform import Rotation + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv + +logger = logging.getLogger(__name__) + +ACTION_DIM = 7 # pos(3) + euler(3) + gripper(1) +ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32) +ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32) + +# Default max episode steps per task type +DEFAULT_MAX_EPISODE_STEPS = 500 + +# VLABench task suites +PRIMITIVE_TASKS = [ + "select_fruit", + "select_toy", + "select_chemistry_tube", + "add_condiment", + "select_book", + "select_painting", + "select_drink", + "insert_flower", + "select_billiards", + 
"select_ingredient", + "select_mahjong", + "select_poker", + # Physical series + "density_qa", + "friction_qa", + "magnetism_qa", + "reflection_qa", + "simple_cuestick_usage", + "simple_seesaw_usage", + "sound_speed_qa", + "thermal_expansion_qa", + "weight_qa", +] + +COMPOSITE_TASKS = [ + "cluster_billiards", + "cluster_book", + "cluster_drink", + "cluster_toy", + "cook_dishes", + "cool_drink", + "find_unseen_object", + "get_coffee", + "hammer_nail", + "heat_food", + "make_juice", + "play_mahjong", + "play_math_game", + "play_poker", + "play_snooker", + "rearrange_book", + "rearrange_chemistry_tube", + "set_dining_table", + "set_study_table", + "store_food", + "take_chemistry_experiment", + "use_seesaw_complex", +] + +SUITE_TASKS: dict[str, list[str]] = { + "primitive": PRIMITIVE_TASKS, + "composite": COMPOSITE_TASKS, +} + + +class VLABenchEnv(gym.Env): + """Gymnasium wrapper for VLABench environments. + + Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface. + Supports multiple cameras (front, second, wrist) and end-effector control. + """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 10} + + def __init__( + self, + task: str = "select_fruit", + obs_type: str = "pixels_agent_pos", + render_mode: str = "rgb_array", + render_resolution: tuple[int, int] = (480, 480), + robot: str = "franka", + max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS, + action_mode: str = "eef", + ): + super().__init__() + self.task = task + self.obs_type = obs_type + self.render_mode = render_mode + self.render_resolution = render_resolution + self.robot = robot + self._max_episode_steps = max_episode_steps + self.action_mode = action_mode + + # Deferred — created on first reset() inside worker subprocess to avoid + # inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers. 
+ # We never cache `env.physics`: dm_control exposes it as a weakref + # proxy that goes stale across resets (rebuilds the sim), so we always + # refetch it via `self._env.physics` at the call site. + self._env = None + self.task_description = "" # populated on first reset + # Cached world-frame XYZ of the robot base link. The VLABench datasets + # log both `observation.state` positions and `actions` positions in + # robot-base frame (see VLABench/scripts/convert_to_lerobot.py which + # subtracts `robot_frame_pos` from ee_pos). The robot is attached at a + # fixed offset per task so this is safe to cache once per env build. + self._robot_base_xyz: np.ndarray | None = None + + h, w = self.render_resolution + + if self.obs_type == "state": + raise NotImplementedError( + "The 'state' observation type is not supported in VLABenchEnv. " + "Please use 'pixels' or 'pixels_agent_pos'." + ) + elif self.obs_type == "pixels": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + } + ), + } + ) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8), + } + ), + "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64), + } + ) + else: + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32) + + # Max attempts to rebuild the underlying env when MuJoCo throws + # `PhysicsError` (e.g. 
mjWARN_BADQACC) during VLABench's 20-step + # reset warm-up. Some random task/layout samples land in unstable + # initial configurations; re-sampling the layout almost always + # gives a stable one. A handful of upstream tasks (notably + # `select_mahjong`) have layout samplers that diverge often enough + # to need >>5 retries, so we pick a generous ceiling. + _ENSURE_ENV_MAX_ATTEMPTS = 20 + + def _ensure_env(self) -> None: + """Create the underlying VLABench env on first use. + + Called inside the worker subprocess after fork(), so each worker gets + its own clean rendering context rather than inheriting a stale one from + the parent process (which causes crashes with AsyncVectorEnv). + + Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20 + warm-up `step()` calls while toggling gravity/fluids to let the scene + settle; for some random layouts MuJoCo's integrator diverges and + raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields + a stable one, so we retry a number of times before giving up. Between + attempts we reseed NumPy's global RNG from OS entropy so the upstream + task sampler explores fresh initial states — without this, retries + can replay the same diverging configuration when the sampler is + deterministic given the current RNG state. 
+ """ + if self._env is not None: + return + + import VLABench.robots # noqa: F401 # type: ignore[import-untyped] + import VLABench.tasks # noqa: F401 # type: ignore[import-untyped] + from dm_control.rl.control import PhysicsError # type: ignore[import-untyped] + from VLABench.envs import load_env # type: ignore[import-untyped] + + h, w = self.render_resolution + last_exc: PhysicsError | None = None + for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1): + try: + env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w)) + self._env = env + break + except PhysicsError as exc: + last_exc = exc + logger.warning( + "PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…", + attempt, + self._ENSURE_ENV_MAX_ATTEMPTS, + self.task, + exc, + ) + np.random.seed(None) + if self._env is None: + assert last_exc is not None + raise RuntimeError( + f"VLABench task '{self.task}' failed to produce a stable " + f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} " + f"attempts. This task's upstream sampler diverges too " + f"often for the configured robot; consider removing it " + f"from the eval set. Last physics error: {last_exc}" + ) from last_exc + + # Extract task description from the dm_control task + task_obj = self._env.task + if hasattr(task_obj, "task_description"): + self.task_description = task_obj.task_description + elif hasattr(task_obj, "language_instruction"): + self.task_description = task_obj.language_instruction + else: + self.task_description = self.task + + # Cache robot base world position so `_build_ctrl_from_action` and + # `_get_obs` can translate between robot-frame (dataset) and + # world-frame (dm_control) without hitting physics every call. + try: + self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape( + 3 + ) + except Exception: + # Fallback to VLABench's default Franka base position. 
+ self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64) + + def _get_obs(self) -> dict: + """Get current observation from the environment.""" + assert self._env is not None + + obs = self._env.get_observation() + h, w = self.render_resolution + + def _to_hwc3(arr: np.ndarray) -> np.ndarray: + """Coerce any camera array to the declared (h, w, 3) uint8 shape.""" + a = np.asarray(arr) + # Drop a leading singleton batch dim if present. + while a.ndim > 3 and a.shape[0] == 1: + a = a[0] + if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4): + # CHW → HWC + a = np.transpose(a, (1, 2, 0)) + if a.ndim == 2: + a = np.stack([a] * 3, axis=-1) + if a.ndim != 3: + return np.zeros((h, w, 3), dtype=np.uint8) + # Force 3 channels. + if a.shape[-1] == 1: + a = np.repeat(a, 3, axis=-1) + elif a.shape[-1] == 4: + a = a[..., :3] + elif a.shape[-1] != 3: + return np.zeros((h, w, 3), dtype=np.uint8) + if a.shape[:2] != (h, w): + a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA) + return a.astype(np.uint8) + + # Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays + raw_frames: list[np.ndarray] = [] + if "rgb" in obs: + rgb = obs["rgb"] + if isinstance(rgb, np.ndarray): + if rgb.ndim == 4: + raw_frames = [rgb[i] for i in range(rgb.shape[0])] + elif rgb.ndim == 3: + raw_frames = [rgb] + + image_keys = ["image", "second_image", "wrist_image"] + images: dict[str, np.ndarray] = {} + for i, key in enumerate(image_keys): + if i < len(raw_frames): + images[key] = _to_hwc3(raw_frames[i]) + else: + images[key] = np.zeros((h, w, 3), dtype=np.uint8) + + # Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]` + # to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3), + # gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions + # are stored in robot-base frame and orientations as scipy extrinsic + # 'xyz' euler angles. 
+ raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel() + pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64) + quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64) + gripper = float(raw[7]) if raw.size >= 8 else 0.0 + + base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64) + pos_robot = pos_world - base + euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler( + "xyz", degrees=False + ) + + ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64) + + if self.obs_type == "pixels": + return {"pixels": images} + elif self.obs_type == "pixels_agent_pos": + return { + "pixels": images, + "agent_pos": ee_state.astype(np.float64), + } + else: + raise ValueError(f"Unknown obs_type: {self.obs_type}") + + # ---- Action adaptation (EEF → joint ctrl) -------------------------------- + # + # The HF vlabench datasets log 7D actions + # `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`, + # exactly matching VLABench's own eval pipeline (evaluator.base): + # pos, euler, g = policy(...) + # quat = euler_to_quaternion(*euler) # extrinsic xyz -> wxyz + # _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat) + # env.step(np.concatenate([qpos, [g, g]])) + # + # VLABench's dm_control task writes `data.ctrl[:] = action` directly — for + # Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the + # above conversion so the policy's EEF commands actually drive the robot. + + _FRANKA_FINGER_OPEN = 0.04 # qpos when gripper fully open + + def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray: + """Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector. + + For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) + + 2 gripper finger qposes (open/closed based on the gripper scalar). 
+ If the action is already joint-space (shape matches ctrl_dim), pass + through. + """ + if action.shape[0] == ctrl_dim: + return action.astype(np.float64, copy=False) + + if action.shape[0] != 7: + # Unknown layout — fall back to zero-pad so the sim doesn't crash. + padded = np.zeros(ctrl_dim, dtype=np.float64) + padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim] + return padded + + from dm_control.utils.inverse_kinematics import qpos_from_site_pose + + # Action position is in robot-base frame (see convert_to_lerobot.py); + # dm_control's IK expects a world-frame target. + base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64) + pos_world = np.asarray(action[:3], dtype=np.float64) + base + rx, ry, rz = float(action[3]), float(action[4]), float(action[5]) + gripper = float(np.clip(action[6], 0.0, 1.0)) + + # Dataset euler is scipy extrinsic 'xyz' (same as VLABench's + # `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK + # and MuJoCo use `[w, x, y, z]`, so reorder. + qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat() + quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64) + + assert self._env is not None + robot = self._env.task.robot + site_name = robot.end_effector_site.full_identifier + + # inplace=False so IK doesn't mutate physics state mid-step — we only + # want the solved qpos. Fetch a fresh physics handle — caching it can + # yield a stale weakref after a reset. + ik_result = qpos_from_site_pose( + self._env.physics, + site_name=site_name, + target_pos=pos_world, + target_quat=quat, + inplace=False, + max_steps=100, + ) + n_dof = robot.n_dof # 7 for Franka + arm_qpos = ik_result.qpos[:n_dof] + + # Dataset gripper convention: 1 = open (finger qpos = 0.04), + # 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py + # where `trajectory[i][-1] > 0.03` is encoded as `1`. 
+ finger_qpos = gripper * self._FRANKA_FINGER_OPEN + + ctrl = np.zeros(ctrl_dim, dtype=np.float64) + ctrl[:n_dof] = arm_qpos + # Remaining entries are gripper fingers (usually 2 for Franka). + ctrl[n_dof:] = finger_qpos + return ctrl + + def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]: + self._ensure_env() + assert self._env is not None + super().reset(seed=seed) + + if seed is not None: + self._seed_inner_env(int(self.np_random.integers(0, 2**31 - 1))) + + self._env.reset() + + observation = self._get_obs() + info = {"is_success": False} + return observation, info + + def _seed_inner_env(self, seed: int) -> None: + """Propagate `seed` to the inner dm_control env. `Environment.reset()` + doesn't accept a seed, so we re-seed the task and environment + `RandomState`s directly. Best-effort: silently skipped when the + expected attributes are absent on a given VLABench version. + """ + for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")): + owner = getattr(self._env, owner_attr) if owner_attr else self._env + rng = getattr(owner, rng_attr, None) + rng_seed = getattr(rng, "seed", None) + if callable(rng_seed): + rng_seed(seed) + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + from dm_control.rl.control import PhysicsError # type: ignore[import-untyped] + + self._ensure_env() + assert self._env is not None + + if action.ndim != 1: + raise ValueError( + f"Expected action to be 1-D (shape (action_dim,)), " + f"but got shape {action.shape} with ndim={action.ndim}" + ) + + if self.action_mode not in ("eef", "joint", "delta_eef"): + raise ValueError(f"Unknown action_mode: {self.action_mode}") + + # Always refetch physics — dm_control returns a weakref proxy that can + # go stale across resets. 
+ physics = self._env.physics + ctrl_dim = int(physics.data.ctrl.shape[0]) + ctrl = self._build_ctrl_from_action(action, ctrl_dim) + try: + timestep = self._env.step(ctrl) + except PhysicsError as exc: + # Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as + # a graceful failed termination rather than a hard crash — the + # rest of the multi-task eval should still run. + logger.warning( + "PhysicsError during step on task '%s': %s. Terminating episode.", + self.task, + exc, + ) + observation = self._get_obs() + info = {"task": self.task, "is_success": False, "physics_error": True} + # Drop the stale env so the next reset() rebuilds it cleanly. + with contextlib.suppress(Exception): + self._env.close() + self._env = None + return observation, 0.0, True, False, info + + # Extract reward from dm_control timestep + reward = float(timestep.reward) if timestep.reward is not None else 0.0 + + # Check success via the task's termination condition + is_success = False + if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"): + is_success = bool(self._env.task.should_terminate_episode(self._env.physics)) + + terminated = is_success + truncated = False + info = { + "task": self.task, + "is_success": is_success, + } + + observation = self._get_obs() + + if terminated: + self.reset() + + return observation, reward, terminated, truncated, info + + def render(self) -> np.ndarray: + self._ensure_env() + obs = self._get_obs() + return obs["pixels"]["image"] + + def close(self): + if self._env is not None: + self._env.close() + self._env = None + + +# ---- Main API ---------------------------------------------------------------- + + +def create_vlabench_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] | None = None, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, +) -> dict[str, dict[int, Any]]: + """ + Create vectorized VLABench environments with a consistent return shape. 
+ + Returns: + dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories) + + Notes: + - n_envs is the number of rollouts *per task*. + - `task` can be a suite name ("primitive", "composite"), a comma-separated list of + suite names, or individual task names (e.g. "select_fruit,heat_food"). + """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + gym_kwargs = dict(gym_kwargs or {}) + task_groups = [t.strip() for t in task.split(",") if t.strip()] + if not task_groups: + raise ValueError("`task` must contain at least one VLABench task or suite name.") + + logger.info( + "Creating VLABench envs | task_groups=%s | n_envs(per task)=%d", + task_groups, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + cached_obs_space = None + cached_act_space = None + cached_metadata = None + out: dict[str, dict[int, Any]] = defaultdict(dict) + + for group in task_groups: + # Check if it's a suite name, otherwise treat as individual task + tasks = SUITE_TASKS.get(group, [group]) + + for tid, task_name in enumerate(tasks): + logger.info( + "Building vec env | group=%s | task_id=%d | task=%s", + group, + tid, + task_name, + ) + + fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)] + + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[group][tid] = lazy + else: + out[group][tid] = env_cls(fns) + + return {group: dict(task_map) for group, task_map in out.items()} diff --git a/tests/envs/test_robotwin.py b/tests/envs/test_robotwin.py new file mode 100644 index 000000000..fcd45adbf --- /dev/null +++ 
b/tests/envs/test_robotwin.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper. + +These tests mock out the SAPIEN-based RoboTwin runtime (task modules + +YAML config loader) so they run without the full RoboTwin installation +(SAPIEN, CuRobo, mplib, asset downloads, etc.). +""" + +from __future__ import annotations + +from contextlib import contextmanager +from unittest.mock import MagicMock, patch + +import gymnasium as gym +import numpy as np +import pytest + +from lerobot.envs.robotwin import ( + ACTION_DIM, + ROBOTWIN_CAMERA_NAMES, + ROBOTWIN_TASKS, + RoboTwinEnv, + create_robotwin_envs, +) + +# --------------------------------------------------------------------------- +# Fixtures / helpers +# --------------------------------------------------------------------------- + + +def _make_mock_task_env( + height: int = 240, + width: int = 320, + cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES, +) -> MagicMock: + """Return a mock that mimics the RoboTwin task class API. + + RoboTwin's real get_obs returns + {"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...} + so the mock follows the same nested shape. 
+ """ + obs_dict = { + "observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras}, + "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)}, + "endpose": {}, + } + + mock = MagicMock() + mock.get_obs.return_value = obs_dict + mock.setup_demo.return_value = None + mock.take_action.return_value = None + mock.eval_success = False + mock.check_success.return_value = False + mock.close_env.return_value = None + return mock + + +@contextmanager +def _patch_runtime(mock_task_instance: MagicMock): + """Patch both the task-class loader and the YAML config loader so the + env can construct + reset without a real RoboTwin install.""" + task_cls = MagicMock(return_value=mock_task_instance) + fake_setup = { + "head_camera_h": 240, + "head_camera_w": 320, + "left_embodiment_config": {}, + "right_embodiment_config": {}, + "left_robot_file": "", + "right_robot_file": "", + "dual_arm_embodied": True, + "render_freq": 0, + "task_name": "beat_block_hammer", + "task_config": "demo_clean", + } + with ( + patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_cls), + patch("lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=fake_setup), + ): + yield + + +# --------------------------------------------------------------------------- +# RoboTwinEnv unit tests +# --------------------------------------------------------------------------- + + +class TestRoboTwinEnv: + def test_observation_space_shape(self): + """observation_space should have the configured h×w×3 for every camera.""" + h, w = 240, 320 + env = RoboTwinEnv( + task_name="beat_block_hammer", + observation_height=h, + observation_width=w, + camera_names=["head_camera", "left_camera"], + ) + pixels_space = env.observation_space["pixels"] + assert pixels_space["head_camera"].shape == (h, w, 3) + assert pixels_space["left_camera"].shape == (h, w, 3) + assert "right_camera" not in pixels_space + + def test_action_space(self): + env = 
RoboTwinEnv(task_name="beat_block_hammer") + assert env.action_space.shape == (ACTION_DIM,) + assert env.action_space.dtype == np.float32 + + def test_reset_returns_correct_obs_keys(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + obs, info = env.reset() + + assert "pixels" in obs + for cam in ROBOTWIN_CAMERA_NAMES: + assert cam in obs["pixels"], f"Missing camera '{cam}' in obs" + assert "agent_pos" in obs + assert obs["agent_pos"].shape == (ACTION_DIM,) + assert info["is_success"] is False + + def test_reset_calls_setup_demo(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + env.reset(seed=42) + # setup_demo receives the full YAML-derived kwargs plus seed + is_test; + # we only assert the caller-provided bits. + assert mock_task.setup_demo.call_count == 1 + call_kwargs = mock_task.setup_demo.call_args.kwargs + assert call_kwargs["seed"] == 42 + assert call_kwargs["is_test"] is True + + def test_step_returns_correct_types(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + obs, reward, terminated, truncated, info = env.step(action) + + assert isinstance(obs, dict) + assert isinstance(reward, float) + assert isinstance(terminated, bool) + assert isinstance(truncated, bool) + assert isinstance(info, dict) + + def test_step_wrong_action_shape_raises(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + bad_action = np.zeros(7, dtype=np.float32) # wrong dim + with _patch_runtime(mock_task): + env.reset() + with pytest.raises(ValueError, match="Expected 1-D action"): + env.step(bad_action) + + def test_success_terminates_episode(self): + mock_task = _make_mock_task_env() + mock_task.check_success.return_value = True + env = 
RoboTwinEnv(task_name="beat_block_hammer") + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + _, _, terminated, _, info = env.step(action) + assert terminated is True + assert info["is_success"] is True + + def test_truncation_after_episode_length(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2) + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + env.step(action) # step 1 + _, _, _, truncated, _ = env.step(action) # step 2 → truncated + assert truncated is True + + def test_close_calls_close_env(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + env.reset() + env.close() + mock_task.close_env.assert_called_once() + + def test_black_frame_for_missing_camera(self): + """If a camera key is absent from get_obs(), a black frame is returned.""" + # Mock exposes only head_camera; we ask for both head_camera + left_camera. 
+ mock_task = _make_mock_task_env(height=10, width=10, cameras=("head_camera",)) + env = RoboTwinEnv( + task_name="beat_block_hammer", + camera_names=["head_camera", "left_camera"], + observation_height=10, + observation_width=10, + ) + with _patch_runtime(mock_task): + obs, _ = env.reset() + assert obs["pixels"]["left_camera"].shape == (10, 10, 3) + assert obs["pixels"]["left_camera"].sum() == 0 + + def test_task_and_task_description_attributes(self): + env = RoboTwinEnv(task_name="beat_block_hammer") + assert env.task == "beat_block_hammer" + assert isinstance(env.task_description, str) + + def test_deferred_init_env_is_none_before_reset(self): + env = RoboTwinEnv(task_name="beat_block_hammer") + assert env._env is None # noqa: SLF001 (testing internal state) + + +# --------------------------------------------------------------------------- +# create_robotwin_envs tests +# --------------------------------------------------------------------------- + + +class TestCreateRoboTwinEnvs: + def test_returns_correct_structure(self): + mock_task = _make_mock_task_env() + with _patch_runtime(mock_task): + envs = create_robotwin_envs( + task="beat_block_hammer", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + assert "beat_block_hammer" in envs + assert 0 in envs["beat_block_hammer"] + assert isinstance(envs["beat_block_hammer"][0], gym.vector.SyncVectorEnv) + + def test_multi_task(self): + mock_task = _make_mock_task_env() + with _patch_runtime(mock_task): + envs = create_robotwin_envs( + task="beat_block_hammer,click_bell", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + assert set(envs.keys()) == {"beat_block_hammer", "click_bell"} + + def test_unknown_task_raises(self): + with pytest.raises(ValueError, match="Unknown RoboTwin tasks"): + create_robotwin_envs( + task="not_a_real_task", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + + def test_invalid_n_envs_raises(self): + with pytest.raises(ValueError, match="n_envs must be a positive int"): + 
create_robotwin_envs( + task="beat_block_hammer", + n_envs=0, + env_cls=gym.vector.SyncVectorEnv, + ) + + +# --------------------------------------------------------------------------- +# ROBOTWIN_TASKS list +# --------------------------------------------------------------------------- + + +def test_task_list_not_empty(): + assert len(ROBOTWIN_TASKS) >= 50 + + +def test_all_tasks_are_strings(): + assert all(isinstance(t, str) and t for t in ROBOTWIN_TASKS) + + +def test_no_duplicate_tasks(): + assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS)) diff --git a/tests/test_robomme_env.py b/tests/test_robomme_env.py new file mode 100644 index 000000000..20646430a --- /dev/null +++ b/tests/test_robomme_env.py @@ -0,0 +1,232 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the RoboMME env wrapper and config. + +RoboMME requires Linux + ManiSkill (Vulkan/SAPIEN), so tests that touch the +env wrapper mock the ``robomme`` package. Tests that only exercise the +dataclass config run without any mocking. 
+""" + +from __future__ import annotations + +import sys +from types import ModuleType +from unittest.mock import MagicMock + +import numpy as np + + +def _install_robomme_stub(): + """Register a minimal stub for the ``robomme`` package on sys.modules.""" + stub = ModuleType("robomme") + wrapper_stub = ModuleType("robomme.env_record_wrapper") + + class FakeBuilder: + def __init__(self, **kwargs): + pass + + def make_env_for_episode(self, episode_idx: int, max_steps: int): + env = MagicMock() + obs = { + "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "joint_state_list": [np.zeros(7, dtype=np.float32)], + "gripper_state_list": [np.zeros(2, dtype=np.float32)], + } + env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"}) + env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""}) + return env + + wrapper_stub.BenchmarkEnvBuilder = FakeBuilder + stub.env_record_wrapper = wrapper_stub + sys.modules["robomme"] = stub + sys.modules["robomme.env_record_wrapper"] = wrapper_stub + + +def _uninstall_robomme_stub(): + sys.modules.pop("robomme", None) + sys.modules.pop("robomme.env_record_wrapper", None) + + +# --------------------------------------------------------------------------- +# Config tests (no sim required) +# --------------------------------------------------------------------------- + + +def test_robomme_env_config_defaults(): + from lerobot.envs.configs import RoboMMEEnv + + cfg = RoboMMEEnv() + assert cfg.task == "PickXtimes" + assert cfg.fps == 10 + assert cfg.episode_length == 300 + assert cfg.action_space == "joint_angle" + assert cfg.dataset_split == "test" + assert cfg.task_ids is None + + +def test_robomme_env_config_type(): + from lerobot.envs.configs import RoboMMEEnv + + cfg = RoboMMEEnv() + assert cfg.type == "robomme" + + +def test_robomme_features_map(): + from lerobot.envs.configs import RoboMMEEnv + from 
lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE + + cfg = RoboMMEEnv() + assert cfg.features_map[ACTION] == ACTION + assert cfg.features_map["pixels/image"] == f"{OBS_IMAGES}.image" + assert cfg.features_map["pixels/wrist_image"] == f"{OBS_IMAGES}.wrist_image" + assert cfg.features_map["agent_pos"] == OBS_STATE + + +def test_robomme_features_action_dim_joint_angle(): + from lerobot.envs.configs import RoboMMEEnv + from lerobot.utils.constants import ACTION + + cfg = RoboMMEEnv(action_space="joint_angle") + assert cfg.features[ACTION].shape == (8,) + + +def test_robomme_features_action_dim_ee_pose(): + """`ee_pose` uses a 7-D action; __post_init__ sets the correct shape.""" + from lerobot.envs.configs import RoboMMEEnv + from lerobot.utils.constants import ACTION + + cfg = RoboMMEEnv(action_space="ee_pose") + assert cfg.features[ACTION].shape == (7,) + + +# --------------------------------------------------------------------------- +# Obs conversion (pure Python, no sim) +# --------------------------------------------------------------------------- + + +def test_convert_obs_list_format(): + """_convert_obs takes the last element from list-format obs fields and + emits a nested ``pixels`` dict (image, wrist_image) plus ``agent_pos``. + + The nested layout is required so ``preprocess_observation()`` in + ``envs/utils.py`` maps each camera to ``observation.images.``. 
+ """ + _install_robomme_stub() + try: + from lerobot.envs.robomme import RoboMMEGymEnv + + env = RoboMMEGymEnv.__new__(RoboMMEGymEnv) + + front = np.full((256, 256, 3), 42, dtype=np.uint8) + wrist = np.full((256, 256, 3), 7, dtype=np.uint8) + joints = np.arange(7, dtype=np.float32) + gripper = np.array([0.5, 0.5], dtype=np.float32) + + obs_raw = { + "front_rgb_list": [np.zeros_like(front), front], + "wrist_rgb_list": [np.zeros_like(wrist), wrist], + "joint_state_list": [np.zeros(7, dtype=np.float32), joints], + "gripper_state_list": [np.zeros(2, dtype=np.float32), gripper], + } + + result = env._convert_obs(obs_raw) + np.testing.assert_array_equal(result["pixels"]["image"], front) + np.testing.assert_array_equal(result["pixels"]["wrist_image"], wrist) + assert result["agent_pos"].shape == (8,) + np.testing.assert_array_almost_equal(result["agent_pos"][:7], joints) + assert result["agent_pos"][7] == gripper[0] + finally: + _uninstall_robomme_stub() + + +def test_convert_obs_array_format(): + """_convert_obs also handles non-list (direct array) obs.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import RoboMMEGymEnv + + env = RoboMMEGymEnv.__new__(RoboMMEGymEnv) + + front = np.zeros((256, 256, 3), dtype=np.uint8) + obs_raw = { + "front_rgb_list": front, + "wrist_rgb_list": front, + "joint_state_list": np.zeros(7, dtype=np.float32), + "gripper_state_list": np.zeros(2, dtype=np.float32), + } + result = env._convert_obs(obs_raw) + assert result["pixels"]["image"].shape == (256, 256, 3) + assert result["pixels"]["wrist_image"].shape == (256, 256, 3) + assert result["agent_pos"].shape == (8,) + finally: + _uninstall_robomme_stub() + + +# --------------------------------------------------------------------------- +# create_robomme_envs (mocked sim) +# --------------------------------------------------------------------------- + + +def test_create_robomme_envs_returns_correct_structure(): + """Single task -> {task_name: {task_id: VectorEnv}} with one entry 
per task_id.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import create_robomme_envs + + env_cls = MagicMock(return_value=MagicMock()) + result = create_robomme_envs( + task="PickXtimes", + n_envs=1, + task_ids=[0, 1], + env_cls=env_cls, + ) + + assert "PickXtimes" in result + assert 0 in result["PickXtimes"] + assert 1 in result["PickXtimes"] + assert env_cls.call_count == 2 + finally: + _uninstall_robomme_stub() + + +def test_create_robomme_envs_multi_task(): + """Comma-separated task list produces one suite per task.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import create_robomme_envs + + env_cls = MagicMock(return_value=MagicMock()) + result = create_robomme_envs( + task="PickXtimes,BinFill,StopCube", + n_envs=1, + env_cls=env_cls, + ) + + assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"} + finally: + _uninstall_robomme_stub() + + +def test_create_robomme_envs_raises_on_invalid_env_cls(): + _install_robomme_stub() + try: + import pytest + + from lerobot.envs.robomme import create_robomme_envs + + with pytest.raises(ValueError, match="env_cls must be a callable"): + create_robomme_envs(task="PickXtimes", n_envs=1, env_cls=None) + finally: + _uninstall_robomme_stub()