fix(robotwin): pin compatible curobo in benchmark image

Merge remote-tracking branch 'origin/feat/robotwin-benchmark' into feat/robotwin-benchmark
Merge branch 'main' into feat/robotwin-benchmark
2026-05-12 15:19:43 +00:00 · 2026-04-21 18:41:16 +02:00 · 2026-04-20 17:31:28 +02:00 · 2026-04-20 17:17:00 +02:00 · 2026-04-20 17:10:53 +02:00 · 2026-04-20 15:33:13 +02:00
80 changed files with 4761 additions and 528 deletions
@@ -2,11 +2,6 @@

 Short, imperative summary (e.g., "fix(robots): handle None in sensor parser"). See [CONTRIBUTING.md](../CONTRIBUTING.md) for PR conventions.

-## Type / Scope
-
- **Type**: (Bug | Feature | Docs | Performance | Test | CI | Chore)
- **Scope**: (optional — name of module or package affected)
-
 ## Summary / Motivation

 - One-paragraph description of what changes and why.
@@ -19,28 +14,14 @@ Short, imperative summary (e.g., "fix(robots): handle None in sensor parser"). S

 ## What changed

- Short, concrete bullets of the modifications (files/behaviour).
+- Short, concrete bullets explaining the functional changes (how the behavior or output differs now).
 - Short note if this introduces breaking changes and migration steps.

 ## How was this tested (or how to run locally)

- Tests added: list new tests or test files.
+- Tests added: list new tests or test files and the command to run them (e.g., `pytest -q tests/ -k <keyword>`).
 - Manual checks / dataset runs performed.
- Instructions for the reviewer
-
-Example:
-
- Ran the relevant tests:
-
-  ```bash
-  pytest -q tests/ -k <keyword>
-  ```
-
- Reproduce with a quick example or CLI (if applicable):
-
-  ```bash
-  lerobot-train --some.option=true
-  ```
+- Instructions for the reviewer to reproduce the change with a quick example or CLI command (if applicable).

 ## Checklist (required before merge)

@@ -48,6 +29,7 @@ Example:
 - [ ] All tests pass locally (`pytest`)
 - [ ] Documentation updated
 - [ ] CI is green
+- [ ] Community Review: I have reviewed another contributor's open PR and linked it here: # (insert PR number/link)

 ## Reviewer notes

@@ -0,0 +1,527 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Integration tests: build an isolated Docker image per benchmark and run a
+# 1-episode smoke eval. Each benchmark gets its own image so incompatible
+# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide.
+#
+# To add a new benchmark:
+#   1. Add docker/Dockerfile.benchmark.<name>  (install only lerobot[<name>])
+#   2. Copy one of the jobs below and adjust the image name and eval command.
+name: Benchmark Integration Tests
+
+on:
+  # Run manually from the Actions tab
+  workflow_dispatch:
+
+  # Run every Monday at 02:00 UTC.
+  schedule:
+    - cron: "0 2 * * 1"
+
+  push:
+    branches:
+      - main
+    paths:
+      - "src/lerobot/envs/**"
+      - "src/lerobot/scripts/lerobot_eval.py"
+      - "docker/Dockerfile.benchmark.*"
+      - ".github/workflows/benchmark_tests.yml"
+      - "pyproject.toml"
+
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/lerobot/envs/**"
+      - "src/lerobot/scripts/lerobot_eval.py"
+      - "docker/Dockerfile.benchmark.*"
+      - ".github/workflows/benchmark_tests.yml"
+      - "pyproject.toml"
+
+permissions:
+  contents: read
+
+env:
+  UV_VERSION: "0.8.0"
+  PYTHON_VERSION: "3.12"
+
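+# Cancel in-flight runs for the same PR. Non-PR events (push, schedule,
+# manual dispatch) have no head_ref, so their group falls back to the unique
+# run_id and they are never cancelled.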
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # ── LIBERO ────────────────────────────────────────────────────────────────
+  # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain)
+  libero-integration-test:
+    name: Libero — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      # Build the benchmark-specific image. The Dockerfile separates dep-install
+      # from source-copy, so code-only changes skip the slow uv-sync layer
+      # when the runner has a warm Docker daemon cache.
+      - name: Build Libero benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.libero
+          push: false
+          load: true
+          tags: lerobot-benchmark-libero:ci
+
+      - name: Run Libero smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so we can docker cp artifacts out.
+          # Output to /tmp inside the container — /artifacts doesn't exist
+          # and user_lerobot cannot create root-level dirs.
+          #
+          # The in-container script starts with `set -e`: without it the exit
+          # status of `bash -c` is that of the LAST command
+          # (extract_task_descriptions.py), so a crashing lerobot-eval would
+          # still leave this step green. The login line keeps its own
+          # `|| true`, so a failed login stays best-effort.
+          docker run --name libero-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-libero:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_libero \
+                --env.type=libero \
+                --env.task=libero_spatial \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero --task libero_spatial \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy Libero artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/libero-artifacts
+          docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
+          docker rm -f libero-eval || true
+
+      - name: Parse Libero eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/libero-artifacts \
+            --env libero \
+            --task libero_spatial \
+            --policy lerobot/smolvla_libero
+
+      - name: Upload Libero rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-rollout-video
+          path: /tmp/libero-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload Libero eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-metrics
+          path: /tmp/libero-artifacts/metrics.json
+          if-no-files-found: warn
+
+      # ── LIBERO TRAIN+EVAL SMOKE ──────────────────────────────────────────────
+      # Train SmolVLA for 1 step (batch_size=1, dataset episode 0 only) then
+      # immediately runs eval inside the training loop (eval_freq=1, 1 episode).
+      # Tests the full train→eval-within-training pipeline end-to-end.
+      - name: Run Libero train+eval smoke (1 step, eval_freq=1)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name libero-train-smoke --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-libero:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              accelerate launch --num_processes=1 \$(which lerobot-train) \
+                --policy.path=lerobot/smolvla_base \
+                --policy.load_vlm_weights=true \
+                --policy.scheduler_decay_steps=25000 \
+                --policy.freeze_vision_encoder=false \
+                --policy.train_expert_only=false \
+                --dataset.repo_id=lerobot/libero \
+                --dataset.episodes=[0] \
+                --dataset.use_imagenet_stats=false \
+                --env.type=libero \
+                --env.task=libero_spatial \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/train-smoke \
+                --steps=1 \
+                --batch_size=1 \
+                --eval_freq=1 \
+                --eval.n_episodes=1 \
+                --eval.batch_size=1 \
+                --eval.use_async_envs=false \
+                --save_freq=1 \
+                --policy.push_to_hub=false \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.image2\": \"observation.images.camera2\"}'
+            "
+
+      - name: Copy Libero train-smoke artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/libero-train-smoke-artifacts
+          docker cp libero-train-smoke:/tmp/train-smoke/. /tmp/libero-train-smoke-artifacts/ 2>/dev/null || true
+          docker rm -f libero-train-smoke || true
+
+      - name: Upload Libero train-smoke eval video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-train-smoke-video
+          path: /tmp/libero-train-smoke-artifacts/eval/
+          if-no-files-found: warn
+
+  # ── METAWORLD ─────────────────────────────────────────────────────────────
+  # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
+  metaworld-integration-test:
+    name: MetaWorld — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build MetaWorld benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.metaworld
+          push: false
+          load: true
+          tags: lerobot-benchmark-metaworld:ci
+
+      - name: Run MetaWorld smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so the next step can docker cp the
+          # artifacts out of /tmp/eval-artifacts.
+          #
+          # The in-container script starts with `set -e`: without it the exit
+          # status of `bash -c` is that of the LAST command
+          # (extract_task_descriptions.py), so a crashing lerobot-eval would
+          # still leave this step green. The login line keeps its own
+          # `|| true`, so a failed login stays best-effort.
+          docker run --name metaworld-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-metaworld:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_metaworld \
+                --env.type=metaworld \
+                --env.task=metaworld-push-v3 \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.image\": \"observation.images.camera1\"}' \
+                --policy.empty_cameras=2 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env metaworld --task metaworld-push-v3 \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy MetaWorld artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/metaworld-artifacts
+          docker cp metaworld-eval:/tmp/eval-artifacts/. /tmp/metaworld-artifacts/ 2>/dev/null || true
+          docker rm -f metaworld-eval || true
+
+      - name: Parse MetaWorld eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/metaworld-artifacts \
+            --env metaworld \
+            --task metaworld-push-v3 \
+            --policy lerobot/smolvla_metaworld
+
+      - name: Upload MetaWorld rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: metaworld-rollout-video
+          path: /tmp/metaworld-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload MetaWorld eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: metaworld-metrics
+          path: /tmp/metaworld-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
+  # Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
+  # pytorch3d, + simulation assets (~4 GB).
+  # Build takes ~20 min on first run; subsequent runs hit the layer cache.
+  # Requires an NVIDIA GPU runner with drivers compatible with CUDA 12.4
+  # (docker/Dockerfile.benchmark.robotwin installs cuda-nvcc-12-4).
+  robotwin-integration-test:
+    name: RoboTwin 2.0 — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOTWIN_POLICY: lerobot/smolvla_robotwin
+      ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      # Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
+      # simulation assets (~4 GB). The BuildKit layer cache is imported from
+      # and exported to /tmp/.buildx-cache-robotwin on the runner's filesystem
+      # (type=local), so it is only reused by runs that land on the same
+      # machine. The image itself is loaded into the local Docker daemon
+      # (load: true) and never pushed.
+      - name: Build RoboTwin 2.0 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robotwin
+          push: false
+          load: true
+          tags: lerobot-benchmark-robotwin:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robotwin
+          cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
+
+      - name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so we can docker cp artifacts out.
+          #
+          # The in-container script starts with `set -e`: without it the exit
+          # status of `bash -c` is that of the LAST command
+          # (extract_task_descriptions.py), so a crashing lerobot-eval would
+          # still leave this step green. `cd` is on its own line because a
+          # failure on the left side of `&&` does not trigger errexit.
+          # HF_HUB_DOWNLOAD_TIMEOUT matches the other benchmark eval containers.
+          docker run --name robotwin-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
+            -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
+            lerobot-benchmark-robotwin:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              cd /opt/robotwin
+              lerobot-eval \
+                --policy.path=\"\$ROBOTWIN_POLICY\" \
+                --env.type=robotwin \
+                --env.task=\"\$ROBOTWIN_TASKS\" \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python /lerobot/scripts/ci/extract_task_descriptions.py \
+                --env robotwin \
+                --task \"\$ROBOTWIN_TASKS\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboTwin artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robotwin-artifacts
+          docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
+          docker rm -f robotwin-eval || true
+
+      - name: Parse RoboTwin eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robotwin-artifacts \
+            --env robotwin \
+            --task "${ROBOTWIN_TASKS}" \
+            --policy "${ROBOTWIN_POLICY}"
+
+      # Both uploads run even when earlier steps failed (always()) and only
+      # warn when the expected path is missing.
+      - name: Upload RoboTwin rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-rollout-video
+          path: /tmp/robotwin-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboTwin eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-metrics
+          path: /tmp/robotwin-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOCASA365 ──────────────────────────────────────────────────────────
+  # Isolated image: robocasa + robosuite installed manually as editable
+  # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
+  # `lerobot==0.3.3`, which would shadow this repo's lerobot).
+  robocasa-integration-test:
+    name: RoboCasa365 — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboCasa365 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocasa
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocasa:ci
+
+      - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so the next step can docker cp the
+          # artifacts out of /tmp/eval-artifacts.
+          #
+          # The in-container script starts with `set -e`: without it the exit
+          # status of `bash -c` is that of the LAST command
+          # (extract_task_descriptions.py), so a crashing lerobot-eval would
+          # still leave this step green. The login line keeps its own
+          # `|| true`, so a failed login stays best-effort.
+          docker run --name robocasa-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e MUJOCO_GL=egl \
+            lerobot-benchmark-robocasa:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robocasa \
+                --env.type=robocasa \
+                --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robocasa \
+                --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboCasa365 artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocasa-artifacts
+          docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
+          docker rm -f robocasa-eval || true
+
+      - name: Parse RoboCasa365 eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocasa-artifacts \
+            --env robocasa \
+            --task atomic_smoke_10 \
+            --policy lerobot/smolvla_robocasa
+
+      - name: Upload RoboCasa365 rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-rollout-video
+          path: /tmp/robocasa-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCasa365 eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-metrics
+          path: /tmp/robocasa-artifacts/metrics.json
+          if-no-files-found: warn
@@ -33,7 +33,7 @@ jobs:
      github.event.workflow_run.event == 'pull_request' &&
      github.event.workflow_run.conclusion == 'success' &&
      github.repository == 'huggingface/lerobot'
-    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3  # main
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6  # main
    with:
      package_name: lerobot
    secrets:
@@ -217,6 +217,24 @@ jobs:
      - name: Run end-to-end tests
        run: make test-end-to-end

+  # Posts the outcome of the dependency-upgrade test run to Slack.
+  slack-notification:
+    name: Slack Notification
+    needs: [cpu-tests, gpu-tests, upgrade-lock]
+    # always() lets this job run even when cpu-tests/gpu-tests failed; the
+    # second clause limits it to runs where upgrade-lock reported a change.
+    if: always() && needs.upgrade-lock.outputs.changed == 'true'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    env:
+      CI_SLACK_CHANNEL: ${{ secrets.CI_SLACK_CHANNEL }}
+    steps:
+      - name: Post to a Slack channel
+        uses: huggingface/hf-workflows/.github/actions/post-slack@a88e7fa2eaee28de5a4d6142381b1fb792349b67  # main
+        with:
+          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
+          title: "Results of the latest dependency tests (CPU + GPU)"
+          # 'success' only when both test jobs succeeded; any other result
+          # (failure, cancelled, skipped) is reported as 'failure'.
+          status: ${{ (needs.cpu-tests.result == 'success' && needs.gpu-tests.result == 'success') && 'success' || 'failure' }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
  # This job creates or updates a PR with the upgraded lockfile
  open-pr:
    name: Open PR
@@ -78,6 +78,9 @@ Use the templates for required fields and examples.
 - **Issues:** Follow the [ticket template](https://github.com/huggingface/lerobot/blob/main/.github/ISSUE_TEMPLATE/bug-report.yml).
 - **Pull requests:** Rebase on `upstream/main`, use a descriptive branch (don't work on `main`), run `pre-commit` and tests locally, and follow the [PR template](https://github.com/huggingface/lerobot/blob/main/.github/PULL_REQUEST_TEMPLATE.md).

-One member of the LeRobot team will then review your contribution.
+> [!IMPORTANT]
+> Community Review Policy: To help scale our efforts and foster a collaborative environment, we ask contributors to review at least one other person's open PR before their own receives attention. This shared responsibility multiplies our review capacity and helps everyone's code get merged faster!
+
+Once you have submitted your PR and completed a peer review, a member of the LeRobot team will review your contribution.

 Thank you for contributing to LeRobot!
@@ -0,0 +1,42 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for LIBERO integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code and LIBERO-specific asset setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.libero -t lerobot-benchmark-libero .
+# Run:    docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
+# runtime (which times out on CI). Point the libero config at the cached path.
+# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
+# so we write the config before any libero import can happen.
+RUN LIBERO_DIR=$(python -c \
+      "import importlib.util, os; s=importlib.util.find_spec('libero'); \
+       print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
+    mkdir -p /home/user_lerobot/.libero && \
+    python -c "\
+from huggingface_hub import snapshot_download; \
+snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
+                  local_dir='/home/user_lerobot/.libero/assets')" && \
+    printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
+    > /home/user_lerobot/.libero/config.yaml
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for MetaWorld integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.metaworld -t lerobot-benchmark-metaworld .
+# Run:    docker run --gpus all --rm lerobot-benchmark-metaworld lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,71 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCasa365 integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code and RoboCasa-specific asset setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install robocasa + robosuite as editable clones. pip-installing from git
+# omits data files like robocasa/models/assets/box_links/box_links_assets.json
+# (not declared in package_data), which download_kitchen_assets needs at import.
+#
+# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
+# in install_requires, which would shadow the editable lerobot baked into
+# this image. We install robocasa's actual runtime deps explicitly instead.
+# Pinned SHAs for reproducible benchmark runs. Bump when you need an
+# upstream fix; don't rely on `main`/`master` drift.
+ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
+ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
+RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
+    git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
+    git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
+    git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
+    uv pip install --no-cache -e ~/robocasa --no-deps && \
+    uv pip install --no-cache -e ~/robosuite && \
+    uv pip install --no-cache \
+      "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
+      "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
+      "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
+      "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
+      "tianshou==0.4.10" "gymnasium==1.2.3"
+
+# Set up robocasa macros and download kitchen assets. We need:
+#   - tex              : base environment textures
+#   - tex_generative   : AI-generated textures; kitchen fixture XMLs embed
+#                        refs to generative_textures/wall/tex*.png
+#                        unconditionally, so MjModel.from_xml_string fails
+#                        at reset time without them (even if the env is
+#                        constructed with generative_textures=None).
+#   - fixtures_lw      : lightwheel kitchen fixtures (fridge, counters...)
+#   - objs_lw          : lightwheel object meshes (stools, misc props)
+# We skip the objaverse/aigen object packs (~30GB combined) by pairing
+# this with --env.obj_registries=["lightwheel"] on the lerobot side.
+# The download script prompts interactively, so pipe 'y' to auto-accept.
+RUN python -m robocasa.scripts.setup_macros && \
+    yes y | python -m robocasa.scripts.download_kitchen_assets \
+      --type tex tex_generative fixtures_lw objs_lw
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
+# replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,131 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboTwin 2.0 integration tests.
+# Extends the nightly GPU image with the RoboTwin simulator stack:
+#   sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
+# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
+#
+# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
+# Run:   docker run --gpus all --rm lerobot-benchmark-robotwin \
+#            lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
+
+FROM huggingface/lerobot-gpu:latest
+
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
+    ROBOTWIN_ROOT=/opt/robotwin
+
+# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
+# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
+USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
+         libvulkan1 vulkan-tools \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+       > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
+    && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# RoboTwin runtime deps (av is already in the base via [av-dep]).
+RUN uv pip install --no-cache \
+        "sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
+        "open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
+
+# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
+RUN uv pip install --no-cache --no-build-isolation \
+        "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+
+# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
+# build aborts on an empty arch list. RoboTwin's own installer pins v0.7.8,
+# which still exposes the v1 API (`curobo.types.math`) that RoboTwin imports.
+ARG CUROBO_REF=v0.7.8
+RUN cd ${ROBOTWIN_ROOT}/envs \
+    && git clone --branch ${CUROBO_REF} --depth 1 https://github.com/NVlabs/curobo.git \
+    && cd curobo \
+    && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
+       uv pip install -e . --no-build-isolation --no-cache
+
+# Upstream patches (mirror RoboTwin's script/_install.sh).
+# These patches target the exact versions pinned above; re-check when upgrading.
+# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
+#   Safe to remove once mplib > 0.2.1 ships with the fix upstream.
+# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
+#   Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
+RUN python - <<'EOF'
+# Apply the mplib / sapien source patches in place, in every site-packages
+# directory. Each substitution is reported individually so the build log shows
+# whether it really matched; a miss is a warning on stderr, not a build failure.
+import pathlib, re, site, sys
+
+
+def warn(msg):
+    """Print a patch warning to stderr so it stands out in the build log."""
+    print(f"WARNING: {msg}", file=sys.stderr)
+
+
+# Literal (old, new) replacements for sapien's URDF loader, applied in order.
+SAPIEN_EDITS = (
+    ("with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'),
+    ('"srdf"', '".srdf"'),
+)
+
+seen = set()
+for d in site.getsitepackages():
+    p = pathlib.Path(d) / "mplib" / "planner.py"
+    if p.exists():
+        seen.add("mplib")
+        # Only the first `or collide` occurrence is removed (count=1).
+        src, hits = re.subn(r"\bor collide\b", "", p.read_text(), count=1)
+        if hits:
+            p.write_text(src)
+            print(f"mplib patch applied: {p}")
+        else:
+            warn(f"mplib patch pattern not found, file unchanged: {p}")
+    p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
+    if p.exists():
+        seen.add("sapien")
+        src = p.read_text()
+        patched = src
+        for old, new in SAPIEN_EDITS:
+            if old in patched:
+                patched = patched.replace(old, new)
+            else:
+                warn(f"sapien patch pattern {old!r} not found in {p}")
+        # Only claim success when the file content actually changed.
+        if patched != src:
+            p.write_text(patched)
+            print(f"sapien patch applied: {p}")
+
+# Surface the case where a package was not found at all, which the per-file
+# branches above cannot report.
+for pkg in ("mplib", "sapien"):
+    if pkg not in seen:
+        warn(f"{pkg} not found in any site-packages directory; nothing patched")
+EOF
+
+# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
+# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
+# The dataset is public — no auth token needed.
+RUN python - <<'EOF'
+# Fetch each asset archive from the Hub into $ROBOTWIN_ROOT/assets, unpack it
+# there, then delete the archive.
+import os, pathlib, zipfile
+from huggingface_hub import hf_hub_download
+
+ARCHIVES = ("embodiments.zip", "objects.zip")
+
+target = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
+target.mkdir(parents=True, exist_ok=True)
+
+for archive_name in ARCHIVES:
+    archive_path = pathlib.Path(
+        hf_hub_download(
+            repo_id="TianxingChen/RoboTwin2.0",
+            repo_type="dataset",
+            filename=archive_name,
+            local_dir=str(target),
+        )
+    )
+    with zipfile.ZipFile(archive_path, "r") as archive:
+        archive.extractall(str(target))
+    archive_path.unlink()
+EOF
+
+WORKDIR ${ROBOTWIN_ROOT}
+RUN python script/update_embodiment_config_path.py
+
+ENV PYTHONPATH="${ROBOTWIN_ROOT}:${PYTHONPATH}"
+
+# Fail the image build early if the CuRobo/RoboTwin import chain regresses.
+RUN python - <<'EOF'
+# Import smoke test: an ImportError here fails this RUN step and the build.
+from curobo.types.math import Pose
+from envs.robot.planner import CuroboPlanner
+
+checks = (
+    ("CuRobo import OK:", Pose),
+    ("RoboTwin planner import OK:", CuroboPlanner),
+)
+for message, symbol in checks:
+    print(message, symbol.__name__)
+EOF
+
+# Return to the lerobot source directory (set by base image) before overlaying.
+WORKDIR /lerobot
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -79,6 +79,10 @@
    title: LIBERO
  - local: metaworld
    title: Meta-World
+  - local: robotwin
+    title: RoboTwin 2.0
+  - local: robocasa
+    title: RoboCasa365
  - local: envhub_isaaclab_arena
    title: NVIDIA IsaacLab Arena Environments
  title: "Benchmarks"
@@ -685,6 +685,10 @@ Example configuration for training the [reward classifier](https://huggingface.c

 ```json
 {
+  "dataset": {
+    "repo_id": "hf_username/dataset_name",
+    "root": null
+  },
  "policy": {
    "type": "reward_classifier",
    "model_name": "helper2424/resnet10",
@@ -705,8 +709,28 @@ Example configuration for training the [reward classifier](https://huggingface.c
        "type": "VISUAL",
        "shape": [3, 128, 128]
      }
-    }
-  }
+    },
+    "push_to_hub": true,
+    "repo_id": "hf_username/model_repo"
+  },
+  "batch_size": 16,
+  "num_workers": 4,
+  "steps": 5000,
+  "log_freq": 10,
+  "eval_freq": 1000,
+  "save_freq": 1000,
+  "save_checkpoint": true,
+  "seed": 2,
+  "resume": false,
+  "optimizer": {
+    "grad_clip_norm": 10.0
+  },
+  "wandb": {
+    "enable": true,
+    "project": "reward-classifier",
+    "disable_artifact": false
+  },
+  "job_name": "reward-classifier"
 }
 ```

@@ -32,6 +32,12 @@ Once you’ve gathered enough trajectories, you’ll train a neural network to i

 If you run into any issues at any point, jump into our [Discord community](https://discord.com/invite/s3KuuzsPFb) for support.

+<Tip>
+
+Want to quickly get the right commands for your setup? The [quickstart notebook](https://github.com/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb) lets you configure your robot once and generates all the commands below ready to paste.
+
+</Tip>
+
 ## Set up and Calibrate

 If you haven't yet set up and calibrated your robot and teleop device, please do so by following the robot-specific tutorial.
@@ -0,0 +1,188 @@
+# RoboCasa365
+
+[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
+
+- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
+- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
+- Project website: [robocasa.ai](https://robocasa.ai)
+- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
+- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
+
+<img
+  src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
+  alt="RoboCasa365 benchmark overview"
+  width="85%"
+/>
+
+## Available tasks
+
+RoboCasa365 organizes its 365 tasks into the two families below. Upstream additionally defines several benchmark groups, which LeRobot exposes as first-class `--env.task` shortcuts (listed at the end of this section).
+
+| Family    | Tasks | Description                                                                     |
+| --------- | ----- | ------------------------------------------------------------------------------- |
+| Atomic    | ~65   | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
+| Composite | ~300  | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc.     |
+
+**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
+
+**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
+
+`--env.task` accepts three forms:
+
+- a single task name (`CloseFridge`)
+- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
+- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
+
+## Installation
+
+RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its conflicting `lerobot==0.3.3` pin):
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/robocasa/robocasa.git ~/robocasa
+git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
+pip install -e ~/robocasa --no-deps
+pip install -e ~/robosuite
+
+# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
+# the bad lerobot pin).
+pip install numpy numba scipy mujoco pygame Pillow opencv-python \
+            pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
+            tianshou gymnasium
+
+python -m robocasa.scripts.setup_macros
+# Lightweight assets (lightwheel object meshes + textures). Enough for
+# the default env out of the box.
+python -m robocasa.scripts.download_kitchen_assets \
+  --type tex tex_generative fixtures_lw objs_lw
+# Optional: full objaverse/aigen registries (~30GB) for richer object
+# variety. Enable at eval time via --env.obj_registries (see below).
+# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
+```
+
+<Tip>
+RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+### Object registries
+
+By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
+
+```bash
+--env.obj_registries='[objaverse,lightwheel]'
+```
+
+## Evaluation
+
+All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Benchmark-group evaluation
+
+Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=atomic_seen \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+Use **20 episodes per task** for reproducible benchmarking; this matches the protocol used in published results.
+
+## Policy inputs and outputs
+
+**Observations** (raw RoboCasa camera names are preserved verbatim):
+
+- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
+- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
+- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
+- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
+
+## Training
+
+### Single-task example
+
+A ready-to-use single-task dataset is on the Hub:
+[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
+
+Fine-tune a SmolVLA base on `CloseFridge`:
+
+```bash
+lerobot-train \
+  --policy.type=smolvla \
+  --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --policy.load_vlm_weights=true \
+  --policy.push_to_hub=true \
+  --dataset.repo_id=pepijn223/robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --output_dir=./outputs/smolvla_robocasa_CloseFridge \
+  --steps=100000 \
+  --batch_size=4 \
+  --eval_freq=5000 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5 \
+  --save_freq=10000
+```
+
+Evaluate the resulting checkpoint:
+
+```bash
+lerobot-eval \
+  --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
@@ -0,0 +1,223 @@
+# RoboTwin 2.0
+
+RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
+
+- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
+- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
+- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
+- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
+
+![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png)
+
+## Overview
+
+| Property      | Value                                                    |
+| ------------- | -------------------------------------------------------- |
+| Tasks         | 50 dual-arm manipulation tasks                           |
+| Robot         | Aloha-AgileX bimanual (14 DOF, 7 per arm)                |
+| Action space  | 14-dim joint-space, continuous in `[-1, 1]`              |
+| Cameras       | `head_camera`, `left_camera`, `right_camera`             |
+| Simulator     | SAPIEN (not MuJoCo)                                      |
+| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations          |
+| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
+
+## Available tasks
+
+RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
+
+| Task                     | CLI name                 | Category          |
+| ------------------------ | ------------------------ | ----------------- |
+| Beat block with hammer   | `beat_block_hammer`      | Tool use          |
+| Click bell / alarm clock | `click_bell`             | Precision press   |
+| Stack blocks (2 / 3)     | `stack_blocks_two/three` | Stacking          |
+| Stack bowls (2 / 3)      | `stack_bowls_two/three`  | Stacking          |
+| Handover block / mic     | `handover_block`         | Bimanual coord.   |
+| Lift pot                 | `lift_pot`               | Bimanual lift     |
+| Shake bottle             | `shake_bottle`           | Continuous motion |
+| Turn switch              | `turn_switch`            | Articulated obj   |
+| Stamp seal               | `stamp_seal`             | Precision place   |
+| Scan object              | `scan_object`            | Mobile manip.     |
+
+Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
+
+<Tip warning={true}>
+  `open_laptop` is currently broken upstream (its `check_success()` uses
+  `self.arm_tag`, which is only set inside the scripted-expert `play_once()`
+  path and therefore unavailable during normal policy eval). Avoid it until the
+  upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
+  `load_actors()`.
+</Tip>
+
+## Dataset
+
+The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
+
+```
+lerobot/robotwin_unified
+```
+
+It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
+
+You can load it directly with the HF Datasets library:
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("lerobot/robotwin_unified", split="train")
+```
+
+## Installation
+
+RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
+
+### 1. Create a conda environment
+
+```bash
+conda create -n robotwin python=3.10 -y
+conda activate robotwin
+```
+
+### 2. Install LeRobot
+
+```bash
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
+pip install -e "."
+```
+
+### 3. Install RoboTwin 2.0
+
+```bash
+git clone https://github.com/RoboTwin-Platform/RoboTwin.git
+cd RoboTwin
+bash script/_install.sh
+bash script/_download_assets.sh
+```
+
+The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
+
+<Tip warning={true}>
+If the automated install fails, install manually:
+
+```bash
+pip install -r requirements.txt
+pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+cd envs && git clone --branch v0.7.8 --depth 1 https://github.com/NVlabs/curobo.git && cd curobo
+pip install -e . --no-build-isolation
+```
+
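+Then apply the source patches that `script/_install.sh` would have applied. For mplib: in `mplib/planner.py` line 807, remove `or collide` from the conditional. For sapien: apply the URDF-loader encoding and `.srdf` extension fixes to `sapien/wrapper/urdf_loader.py` (see `script/_install.sh` for the exact edits).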
+
+</Tip>
+
+### 4. Add RoboTwin to PYTHONPATH
+
+The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
+
+```bash
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+```
+
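+To make it permanent, add the same line to your shell profile with `$(pwd)` replaced by the absolute path of your `RoboTwin/` checkout (otherwise it would resolve to whatever directory the shell starts in).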
+
+## Evaluation
+
+### Standard evaluation (recommended)
+
+Evaluate a policy on a single task with the official protocol (100 episodes):
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Single-task quick check
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5
+```
+
+### Multi-task sweep
+
+Evaluate on several tasks in one run:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Full benchmark (all 50 tasks)
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+<Tip>
+  `open_laptop` is intentionally omitted above because of the upstream
+  `self.arm_tag` bug (see the **Available tasks** section). Re-add it once the
+  upstream fix lands.
+</Tip>
+
+## Camera configuration
+
+By default, all three cameras are included:
+
+| Camera key     | Description                    |
+| -------------- | ------------------------------ |
+| `head_camera`  | Torso-mounted overhead view    |
+| `left_camera`  | Left arm wrist-mounted camera  |
+| `right_camera` | Right arm wrist-mounted camera |
+
+To use a subset of cameras, override `--env.camera_names`:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --env.camera_names="head_camera,left_camera" \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+## Environment config reference
+
+Key parameters for `RoboTwinEnvConfig`:
+
+| Parameter            | Default                                  | Description                        |
+| -------------------- | ---------------------------------------- | ---------------------------------- |
+| `task`               | `"beat_block_hammer"`                    | Comma-separated task name(s)       |
+| `fps`                | `25`                                     | Simulation FPS                     |
+| `episode_length`     | `300`                                    | Max steps per episode              |
+| `obs_type`           | `"pixels_agent_pos"`                     | `"pixels"` or `"pixels_agent_pos"` |
+| `camera_names`       | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras     |
+| `observation_height` | `240`                                    | Camera pixel height                |
+| `observation_width`  | `320`                                    | Camera pixel width                 |
+
+## Leaderboard submission
+
+Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
+
+- Training on 50 `demo_clean` demonstrations per task
+- Evaluating 100 episodes per task
+- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
+
+For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
@@ -0,0 +1,342 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 🤗 LeRobot Quickstart\n",
+    "\n",
+    "Calibration → teleoperation → data collection → training → evaluation.\n",
+    "\n",
+    "Install the required dependencies: `pip install -e \".[notebook,dataset,training,viz,hardware]\"` (the quotes keep shells such as zsh from globbing the brackets).\n",
+    "\n",
+    "**How to use:**\n",
+    "1. Edit the **Configuration** cell with your settings.\n",
+    "2. Run all cells (`Run All`).\n",
+    "3. Each section prints a ready-to-paste terminal command - copy it and run it.\n",
+    "\n",
+    "Each setup is different; please refer to the [LeRobot documentation](https://huggingface.co/docs/lerobot/il_robots) for more details on each step and available options. <br>\n",
+    "Feel free to make this notebook your own and adapt it to your needs!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _cameras_arg(cameras: dict) -> str:\n",
+    "    if not cameras:\n",
+    "        return \"\"\n",
+    "    entries = [f\"{n}: {{{', '.join(f'{k}: {v}' for k, v in cfg.items())}}}\" for n, cfg in cameras.items()]\n",
+    "    return \"{ \" + \", \".join(entries) + \" }\"\n",
+    "\n",
+    "\n",
+    "def print_cmd(*parts: str) -> None:\n",
+    "    \"\"\"Print a shell command with line continuations, skipping empty parts.\"\"\"\n",
+    "    non_empty = [p for p in parts if p]\n",
+    "    print(\" \\\\\\n    \".join(non_empty))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Configuration\n",
+    "\n",
+    "Edit this cell, then **Run All** to generate all commands below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Robot (follower) - run `lerobot-find-port` to discover the port\n",
+    "ROBOT_TYPE = \"so101_follower\"\n",
+    "ROBOT_PORT = \"/dev/ttyACM0\"\n",
+    "ROBOT_ID = \"my_follower_arm\"\n",
+    "\n",
+    "# Teleop (leader) - run `lerobot-find-port` to discover the port\n",
+    "TELEOP_TYPE = \"so101_leader\"\n",
+    "TELEOP_PORT = \"/dev/ttyACM1\"\n",
+    "TELEOP_ID = \"my_leader_arm\"\n",
+    "\n",
+    "# Cameras - set to {} to disable\n",
+    "# Run `lerobot-find-cameras opencv` to list available cameras and their indices\n",
+    "CAMERAS = {\n",
+    "    \"top\": {\"type\": \"opencv\", \"index_or_path\": 2, \"width\": 640, \"height\": 480, \"fps\": 30},\n",
+    "    \"wrist\": {\"type\": \"opencv\", \"index_or_path\": 4, \"width\": 640, \"height\": 480, \"fps\": 30},\n",
+    "}\n",
+    "\n",
+    "# Dataset\n",
+    "HF_USER = \"your_hf_username\"  # `huggingface-cli whoami` to find your username\n",
+    "DATASET_NAME = \"my_so101_dataset\"\n",
+    "TASK_DESCRIPTION = \"pick and place the block\"\n",
+    "NUM_EPISODES = 10\n",
+    "\n",
+    "# Training\n",
+    "POLICY_TYPE = \"act\"  # act, diffusion, smolvla, ...\n",
+    "POLICY_DEVICE = \"cuda\"  # cuda / cpu / mps\n",
+    "TRAIN_STEPS = 10_000\n",
+    "SAVE_FREQ = 2_000\n",
+    "OUTPUT_DIR = f\"outputs/train/{DATASET_NAME}\"\n",
+    "\n",
+    "# Inference - Hub repo ID or local checkpoint path\n",
+    "# e.g. set to f\"{OUTPUT_DIR}/checkpoints/last\" to use a local checkpoint\n",
+    "POLICY_PATH = f\"{HF_USER}/{DATASET_NAME}_{POLICY_TYPE}\"\n",
+    "LAST_CHECKPOINT_PATH = f\"{OUTPUT_DIR}/checkpoints/last\"\n",
+    "\n",
+    "# Derived\n",
+    "DATASET_REPO_ID = f\"{HF_USER}/{DATASET_NAME}\"\n",
+    "DATASET_ROOT = f\"data/{DATASET_NAME}\"\n",
+    "POLICY_REPO_ID = f\"{HF_USER}/{DATASET_NAME}_{POLICY_TYPE}\"\n",
+    "EVAL_REPO_ID = f\"{HF_USER}/eval_{DATASET_NAME}\"\n",
+    "CAMERAS_ARG = _cameras_arg(CAMERAS)\n",
+    "CAMERAS_FLAG = f'--robot.cameras=\"{CAMERAS_ARG}\"' if CAMERAS_ARG else \"\"\n",
+    "\n",
+    "print(f\"Robot  : {ROBOT_TYPE} @ {ROBOT_PORT}\")\n",
+    "print(f\"Teleop : {TELEOP_TYPE} @ {TELEOP_PORT}\")\n",
+    "print(f\"Cameras: {list(CAMERAS) or 'none'}\")\n",
+    "print(f\"Dataset: {DATASET_REPO_ID} ({NUM_EPISODES} episodes) saved to {DATASET_ROOT}\")\n",
+    "print(f\"Policy : {POLICY_TYPE} -> {POLICY_REPO_ID}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 1. Calibration\n",
+    "\n",
+    "Run once per arm before first use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Follower\n",
+    "print_cmd(\n",
+    "    \"lerobot-calibrate\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Leader\n",
+    "print_cmd(\n",
+    "    \"lerobot-calibrate\",\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 2. Teleoperation\n",
+    "\n",
+    "See the [teleoperation docs](https://huggingface.co/docs/lerobot/il_robots#teleoperate) and the [cameras guide](https://huggingface.co/docs/lerobot/cameras) for more options."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-teleoperate\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    \"--display_data=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Record Dataset\n",
+    "\n",
+    "See the [recording docs](https://huggingface.co/docs/lerobot/il_robots#record-a-dataset) for tips on gathering good data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    "    \"--display_data=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Resume a previously interrupted recording session\n",
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--dataset.root={DATASET_ROOT}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    "    \"--display_data=true\",\n",
+    "    \"--resume=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. Train Policy\n",
+    "\n",
+    "See the [training docs](https://huggingface.co/docs/lerobot/il_robots#train-a-policy) for configuration options and tips."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-train\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--policy.type={POLICY_TYPE}\",\n",
+    "    f\"--policy.device={POLICY_DEVICE}\",\n",
+    "    f\"--policy.repo_id={POLICY_REPO_ID}\",\n",
+    "    f\"--output_dir={OUTPUT_DIR}\",\n",
+    "    f\"--steps={TRAIN_STEPS}\",\n",
+    "    f\"--save_freq={SAVE_FREQ}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Resume a previously interrupted training session\n",
+    "print_cmd(\n",
+    "    \"lerobot-train\",\n",
+    "    f\"--config_path={LAST_CHECKPOINT_PATH}/pretrained_model/train_config.json\",\n",
+    "    \"--resume=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. Inference\n",
+    "\n",
+    "Uses `POLICY_PATH` from the Configuration cell (defaults to the Hub repo ID). You can also set it to `LAST_CHECKPOINT_PATH`.\n",
+    "\n",
+    "See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--policy.path={POLICY_PATH}\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={EVAL_REPO_ID}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "lerobot (3.12.3)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -108,9 +108,9 @@ training = [
    "wandb>=0.24.0,<0.25.0",
 ]
 hardware = [
-    "pynput>=1.7.8,<1.9.0",
-    "pyserial>=3.5,<4.0",
-    "deepdiff>=7.0.1,<9.0.0",
+    "lerobot[pynput-dep]",
+    "lerobot[pyserial-dep]",
+    "lerobot[deepdiff-dep]",
 ]
 viz = [
    "rerun-sdk>=0.24.0,<0.27.0",
@@ -136,10 +136,14 @@ scipy-dep = ["scipy>=1.14.0,<2.0.0"]
 diffusers-dep = ["diffusers>=0.27.2,<0.36.0"]
 qwen-vl-utils-dep = ["qwen-vl-utils>=0.0.11,<0.1.0"]
 matplotlib-dep = ["matplotlib>=3.10.3,<4.0.0", "contourpy>=1.3.0,<2.0.0"] # NOTE: Explicitly listing contourpy helps the resolver converge faster.
+pyserial-dep = ["pyserial>=3.5,<4.0"]
+deepdiff-dep = ["deepdiff>=7.0.1,<9.0.0"]
+pynput-dep = ["pynput>=1.7.8,<1.9.0"]
+pyzmq-dep = ["pyzmq>=26.2.1,<28.0.0"]

 # Motors
-feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"]
-dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0"]
+feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
+dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
 damiao = ["lerobot[can-dep]"]
 robstride = ["lerobot[can-dep]"]

@@ -147,10 +151,11 @@ robstride = ["lerobot[can-dep]"]
 openarms = ["lerobot[damiao]"]
 gamepad = ["lerobot[pygame-dep]", "hidapi>=0.14.0,<0.15.0"]
 hopejr = ["lerobot[feetech]", "lerobot[pygame-dep]"]
-lekiwi = ["lerobot[feetech]", "pyzmq>=26.2.1,<28.0.0"]
+lekiwi = ["lerobot[feetech]", "lerobot[pyzmq-dep]"]
 unitree_g1 = [
    # "unitree-sdk2==1.0.1",
-    "pyzmq>=26.2.1,<28.0.0",
+    "lerobot[pyzmq-dep]",
+    "lerobot[pyserial-dep]",
    "onnxruntime>=1.16.0,<2.0.0",
    "onnx>=1.16.0,<2.0.0",
    "meshcat>=0.3.0,<0.4.0",
@@ -196,7 +201,8 @@ async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
 peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]

 # Development
-dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1", "ruff>=0.14.1"]
+dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1", "ruff>=0.14.1", "lerobot[notebook]"]
+notebook = ["jupyter>=1.0.0,<2.0.0", "ipykernel>=6.0.0,<7.0.0"]
 test = ["pytest>=8.1.0,<9.0.0", "pytest-timeout>=2.4.0,<3.0.0", "pytest-cov>=5.0.0,<8.0.0", "mock-serial>=0.0.1,<0.1.0 ; sys_platform != 'win32'"]
 video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]

@@ -206,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
 pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
 libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
 metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
+# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
+# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
+# workspace `lerobot` and makes the graph unsolvable under any resolver
+# (uv, pip). Install it manually alongside robosuite — see
+# docs/source/robocasa.mdx for the recipe.

 # All
 all = [
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract natural-language task descriptions for a benchmark suite.
+
+Runs inside the benchmark Docker container (where the env library is installed)
+immediately after lerobot-eval, writing a JSON file that parse_eval_metrics.py
+picks up and embeds in metrics.json.
+
+Output format: {"<suite>_<task_idx>": "<nl instruction>", ...}
+
+Usage:
+    python scripts/ci/extract_task_descriptions.py \\
+        --env libero --task libero_spatial \\
+        --output /tmp/eval-artifacts/task_descriptions.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def _libero_descriptions(task_suite: str) -> dict[str, str]:
+    from libero.libero import benchmark  # type: ignore[import-untyped]
+
+    suite_dict = benchmark.get_benchmark_dict()
+    if task_suite not in suite_dict:
+        print(
+            f"[extract_task_descriptions] Unknown LIBERO suite '{task_suite}'. "
+            f"Available: {list(suite_dict.keys())}",
+            file=sys.stderr,
+        )
+        return {}
+    suite = suite_dict[task_suite]()
+    return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
+
+
+def _metaworld_descriptions(task_name: str) -> dict[str, str]:
+    # MetaWorld tasks don't expose a separate NL description attribute;
+    # use a cleaned version of the task name as the description.
+    label = task_name.removeprefix("metaworld-").replace("-", " ").strip()
+    return {f"{task_name}_0": label}
+
+
+def _robotwin_descriptions(task_names: str) -> dict[str, str]:
+    """Return descriptions for each requested RoboTwin task. Reads
+    `description/task_instruction/<task>.json` from the RoboTwin clone
+    (cwd is /opt/robotwin in CI). Falls back to the task name if missing."""
+    out: dict[str, str] = {}
+    root = Path("description/task_instruction")
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc_file = root / f"{name}.json"
+        desc = name.replace("_", " ")
+        if desc_file.is_file():
+            data = json.loads(desc_file.read_text())
+            full = data.get("full_description") or desc
+            # Strip the schema placeholders ({A}, {a}) — keep the sentence readable.
+            desc = full.replace("<", "").replace(">", "")
+        out[f"{name}_0"] = desc
+    return out
+
+
+def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
+    """For each task in the comma-separated list, emit a cleaned-name label.
+
+    RoboCasa episodes carry their language instruction in the env's
+    `ep_meta['lang']`, populated per reset. Pulling it requires spinning
+    up the full kitchen env per task (~seconds each); we use the task
+    name as the key here and let the eval's episode info carry the
+    actual instruction.
+    """
+    out: dict[str, str] = {}
+    for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+        # Split CamelCase into words: "CloseFridge" → "close fridge".
+        label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
+        out[f"{task}_0"] = label or task
+    return out
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
+    parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
+    parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
+    args = parser.parse_args()
+
+    descriptions: dict[str, str] = {}
+    try:
+        if args.env == "libero":
+            descriptions = _libero_descriptions(args.task)
+        elif args.env == "metaworld":
+            descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robotwin":
+            descriptions = _robotwin_descriptions(args.task)
+        elif args.env == "robocasa":
+            descriptions = _robocasa_descriptions(args.task)
+        else:
+            print(
+                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
+                file=sys.stderr,
+            )
+    except Exception as exc:
+        print(f"[extract_task_descriptions] Warning: {exc}", file=sys.stderr)
+
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(descriptions, indent=2))
+    print(f"[extract_task_descriptions] {len(descriptions)} descriptions → {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Parse lerobot-eval output into a small metrics.json artifact.
+
+Reads eval_info.json written by lerobot-eval --output_dir and extracts the
+key metrics needed by the health dashboard. Handles both single-task and
+multi-task eval output formats.
+
+NOTE: This script runs on the bare CI runner (not inside Docker), so it
+must use only Python stdlib modules. Do not add third-party imports.
+
+Usage:
+    python scripts/ci/parse_eval_metrics.py \\
+        --artifacts-dir /tmp/libero-artifacts \\
+        --env libero \\
+        --task libero_spatial \\
+        --policy pepijn223/smolvla_libero
+
+Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
+as a GitHub Actions artifact named "<env>-metrics".
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+
+def _safe_float(v: float | int | None) -> float | None:
+    if v is None:
+        return None
+    f = float(v)
+    return None if math.isnan(f) else f
+
+
+def _safe_int(v: float | int | None) -> int | None:
+    if v is None:
+        return None
+    f = float(v)
+    return None if math.isnan(f) else int(f)
+
+
+def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
+    """Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.
+
+    Handles two output shapes:
+      - Single-task: {"aggregated": {"pc_success": 80.0, ...}}
+      - Multi-task:  {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
+    """
+    for key in ("aggregated", "overall"):
+        if key not in info:
+            continue
+        agg = info[key]
+        pc = agg.get("pc_success")
+        n = agg.get("n_episodes")
+        reward = agg.get("avg_sum_reward")
+        eval_s = agg.get("eval_s")
+
+        if pc is not None and not math.isnan(pc):
+            return (
+                float(pc),
+                _safe_int(n),
+                _safe_float(reward),
+                _safe_float(eval_s),
+            )
+
+    return None, None, None, None
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
+    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
+    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
+    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
+    args = parser.parse_args()
+
+    artifacts_dir = Path(args.artifacts_dir)
+    eval_info_path = artifacts_dir / "eval_info.json"
+
+    pc_success: float | None = None
+    n_episodes: int | None = None
+    avg_sum_reward: float | None = None
+    eval_s: float | None = None
+
+    if eval_info_path.exists():
+        try:
+            info = json.loads(eval_info_path.read_text())
+            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
+        except (json.JSONDecodeError, KeyError, TypeError) as exc:
+            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
+    else:
+        print(
+            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
+            file=sys.stderr,
+        )
+
+    task_descriptions: dict[str, str] = {}
+    task_desc_path = artifacts_dir / "task_descriptions.json"
+    if task_desc_path.exists():
+        try:
+            task_descriptions = json.loads(task_desc_path.read_text())
+        except json.JSONDecodeError as exc:
+            print(
+                f"[parse_eval_metrics] Warning: could not parse task_descriptions.json: {exc}",
+                file=sys.stderr,
+            )
+
+    metrics = {
+        "env": args.env,
+        "task": args.task,
+        "policy": args.policy,
+        "pc_success": pc_success,
+        "n_episodes": n_episodes,
+        "avg_sum_reward": avg_sum_reward,
+        "eval_s": eval_s,
+        "task_descriptions": task_descriptions,
+    }
+
+    out_path = artifacts_dir / "metrics.json"
+    out_path.write_text(json.dumps(metrics, indent=2))
+    print(f"[parse_eval_metrics] Written: {out_path}")
+    print(json.dumps(metrics, indent=2))
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -33,7 +33,7 @@ import cv2  # type: ignore  # TODO: add type stubs for OpenCV
 import numpy as np  # type: ignore  # TODO: add type stubs for numpy

 from lerobot.utils.decorators import check_if_not_connected
-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 if TYPE_CHECKING or _reachy2_sdk_available:
    from reachy2_sdk.media.camera import CameraView
@@ -76,6 +76,7 @@ class Reachy2Camera(Camera):
        Args:
            config: The configuration settings for the camera.
        """
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -19,16 +19,18 @@ Provides the RealSenseCamera class for capturing frames from Intel RealSense cam
 import logging
 import time
 from threading import Event, Lock, Thread
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import cv2  # type: ignore  # TODO: add type stubs for OpenCV
 import numpy as np  # type: ignore  # TODO: add type stubs for numpy
 from numpy.typing import NDArray  # type: ignore  # TODO: add type stubs for numpy.typing

-try:
-    import pyrealsense2 as rs  # type: ignore  # TODO: add type stubs for pyrealsense2
-except Exception as e:
-    logging.info(f"Could not import realsense: {e}")
+from lerobot.utils.import_utils import _pyrealsense2_available, require_package
+
+if TYPE_CHECKING or _pyrealsense2_available:
+    import pyrealsense2 as rs
+else:
+    rs = None

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.errors import DeviceNotConnectedError
@@ -112,7 +114,7 @@ class RealSenseCamera(Camera):
        Args:
            config: The configuration settings for the camera.
        """
-
+        require_package("pyrealsense2", extra="intelrealsense")
        super().__init__(config)

        self.config = config
@@ -28,12 +28,19 @@ import json
 import logging
 import time
 from threading import Event, Lock, Thread
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import cv2
 import numpy as np
 from numpy.typing import NDArray

+from lerobot.utils.import_utils import _zmq_available, require_package
+
+if TYPE_CHECKING or _zmq_available:
+    import zmq
+else:
+    zmq = None
+
 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.errors import DeviceNotConnectedError

@@ -74,8 +81,8 @@ class ZMQCamera(Camera):
    """

    def __init__(self, config: ZMQCameraConfig):
+        require_package("pyzmq", extra="pyzmq-dep", import_name="zmq")
        super().__init__(config)
-        import zmq

        self.config = config
        self.server_address = config.server_address
@@ -117,8 +124,6 @@ class ZMQCamera(Camera):
        logger.info(f"Connecting to {self}...")

        try:
-            import zmq
-
            self.context = zmq.Context()
            self.socket = self.context.socket(zmq.SUB)
            self.socket.setsockopt_string(zmq.SUBSCRIBE, "")
@@ -180,11 +185,8 @@ class ZMQCamera(Camera):

        try:
            message = self.socket.recv_string()
-        except Exception as e:
-            # zmq is lazy-imported in connect(), so check by name to avoid a top-level import
-            if type(e).__name__ == "Again":
-                raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e
-            raise
+        except zmq.Again as e:
+            raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e

        # Decode JSON message
        data = json.loads(message)
@@ -28,6 +28,12 @@ import numpy as np
 import torch

 from lerobot.policies import PreTrainedPolicy, prepare_observation_for_inference
+from lerobot.utils.import_utils import _deepdiff_available, require_package
+
+if TYPE_CHECKING or _deepdiff_available:
+    from deepdiff import DeepDiff
+else:
+    DeepDiff = None

 if TYPE_CHECKING:
    from lerobot.datasets import LeRobotDataset
@@ -217,10 +223,7 @@ def sanity_check_dataset_robot_compatibility(
    Raises:
        ValueError: If any of the checked metadata fields do not match.
    """
-    from lerobot.utils.import_utils import require_package
-
-    require_package("deepdiff", extra="hardware")
-    from deepdiff import DeepDiff
+    require_package("deepdiff", extra="deepdiff-dep")

    from lerobot.utils.constants import DEFAULT_FEATURES

@@ -35,6 +35,9 @@ class DatasetConfig:
    revision: str | None = None
    use_imagenet_stats: bool = True
    video_backend: str = field(default_factory=get_safe_default_codec)
+    # When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0).
+    # This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion.
+    return_uint8: bool = False
    streaming: bool = False

    def __post_init__(self) -> None:
@@ -56,6 +56,8 @@ class TrainPipelineConfig(HubMixin):
    # Number of workers for the dataloader.
    num_workers: int = 4
    batch_size: int = 8
+    prefetch_factor: int = 4
+    persistent_workers: bool = True
    steps: int = 100_000
    eval_freq: int = 20_000
    log_freq: int = 200
@@ -16,6 +16,7 @@
 """Private reader component for LeRobotDataset. Handles random-access reading (HF dataset, delta indices, video decoding)."""

 from collections.abc import Callable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

 import datasets
@@ -49,6 +50,7 @@ class DatasetReader:
        video_backend: str,
        delta_timestamps: dict[str, list[float]] | None,
        image_transforms: Callable | None,
+        return_uint8: bool = False,
    ):
        """Initialize the reader with metadata, filtering, and transform config.

@@ -73,6 +75,7 @@ class DatasetReader:
        self._tolerance_s = tolerance_s
        self._video_backend = video_backend
        self._image_transforms = image_transforms
+        self._return_uint8 = return_uint8

        self.hf_dataset: datasets.Dataset | None = None
        self._absolute_to_relative_idx: dict[int, int] | None = None
@@ -105,10 +108,8 @@ class DatasetReader:
        """Build absolute-to-relative index mapping from loaded hf_dataset."""
        self._absolute_to_relative_idx = None
        if self.episodes is not None and self.hf_dataset is not None:
-            self._absolute_to_relative_idx = {
-                abs_idx.item() if isinstance(abs_idx, torch.Tensor) else abs_idx: rel_idx
-                for rel_idx, abs_idx in enumerate(self.hf_dataset["index"])
-            }
+            indices = self.hf_dataset.data.column("index").to_numpy()
+            self._absolute_to_relative_idx = dict(zip(indices.tolist(), range(len(indices)), strict=True))

    @property
    def num_frames(self) -> int:
@@ -235,16 +236,30 @@ class DatasetReader:
        Segmentation Fault.
        """
        ep = self._meta.episodes[ep_idx]
-        item = {}
-        for vid_key, query_ts in query_timestamps.items():
+
+        def _decode_single(vid_key: str, query_ts: list[float]) -> tuple[str, torch.Tensor]:
            from_timestamp = ep[f"videos/{vid_key}/from_timestamp"]
            shifted_query_ts = [from_timestamp + ts for ts in query_ts]
-
            video_path = self.root / self._meta.get_video_file_path(ep_idx, vid_key)
-            frames = decode_video_frames(video_path, shifted_query_ts, self._tolerance_s, self._video_backend)
-            item[vid_key] = frames.squeeze(0)
+            frames = decode_video_frames(
+                video_path,
+                shifted_query_ts,
+                self._tolerance_s,
+                self._video_backend,
+                return_uint8=self._return_uint8,
+            )
+            return vid_key, frames.squeeze(0)

-        return item
+        items = list(query_timestamps.items())
+
+        # Single camera: no threading overhead
+        if len(items) <= 1:
+            return {vid_key: _decode_single(vid_key, query_ts)[1] for vid_key, query_ts in items}
+
+        # Multi-camera: decode in parallel (video decoding releases the GIL)
+        with ThreadPoolExecutor(max_workers=len(items)) as pool:
+            futures = [pool.submit(_decode_single, k, ts) for k, ts in items]
+            return dict(f.result() for f in futures)

    def get_item(self, idx) -> dict:
        """Core __getitem__ logic. Assumes hf_dataset is loaded.
@@ -597,7 +597,7 @@ class DatasetWriter:

    def cleanup_interrupted_episode(self, episode_index: int) -> None:
        """Remove temporary image directories for an interrupted episode."""
-        for key in self._meta.video_keys:
+        for key in self._meta.camera_keys:
            img_dir = self._get_image_file_path(
                episode_index=episode_index, image_key=key, frame_index=0
            ).parent
@@ -92,6 +92,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
                image_transforms=image_transforms,
                revision=cfg.dataset.revision,
                video_backend=cfg.dataset.video_backend,
+                return_uint8=True,
                tolerance_s=cfg.tolerance_s,
            )
        else:
@@ -104,6 +105,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
                revision=cfg.dataset.revision,
                max_num_shards=cfg.num_workers,
                tolerance_s=cfg.tolerance_s,
+                return_uint8=True,
            )
    else:
        raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
@@ -30,13 +30,13 @@ def safe_stop_image_writer(func):
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
-        except Exception as e:
+        except BaseException:
            dataset = kwargs.get("dataset")
            writer = getattr(dataset, "writer", None) if dataset else None
            if writer is not None and writer.image_writer is not None:
                logger.warning("Waiting for image writer to terminate...")
                writer.image_writer.stop()
-            raise e
+            raise

    return wrapper

@@ -56,6 +56,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        force_cache_sync: bool = False,
        download_videos: bool = True,
        video_backend: str | None = None,
+        return_uint8: bool = False,
        batch_encoding_size: int = 1,
        vcodec: str = "libsvtav1",
        streaming_encoding: bool = False,
@@ -202,6 +203,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.tolerance_s = tolerance_s
        self.revision = revision if revision else CODEBASE_VERSION
        self._video_backend = video_backend if video_backend else get_safe_default_codec()
+        self._return_uint8 = return_uint8
        self._batch_encoding_size = batch_encoding_size
        self._vcodec = resolve_vcodec(vcodec)
        self._encoder_threads = encoder_threads
@@ -225,6 +227,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
            video_backend=self._video_backend,
            delta_timestamps=delta_timestamps,
            image_transforms=image_transforms,
+            return_uint8=self._return_uint8,
        )

        # Load actual data
@@ -288,6 +291,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                video_backend=self._video_backend,
                delta_timestamps=self.delta_timestamps,
                image_transforms=self.image_transforms,
+                return_uint8=self._return_uint8,
            )
        return self.reader

@@ -683,6 +687,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.delta_timestamps = None
        obj.episodes = None
        obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec()
+        obj._return_uint8 = False
        obj._batch_encoding_size = batch_encoding_size
        obj._vcodec = vcodec
        obj._encoder_threads = encoder_threads
@@ -775,6 +780,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.delta_timestamps = None
        obj.episodes = None
        obj._video_backend = video_backend if video_backend else get_safe_default_codec()
+        obj._return_uint8 = False
        obj._batch_encoding_size = batch_encoding_size
        obj._vcodec = vcodec
        obj._encoder_threads = encoder_threads
@@ -251,6 +251,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
        seed: int = 42,
        rng: np.random.Generator | None = None,
        shuffle: bool = True,
+        return_uint8: bool = False,
    ):
        """Initialize a StreamingLeRobotDataset.

@@ -288,6 +289,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):

        self.streaming = streaming
        self.buffer_size = buffer_size
+        self._return_uint8 = return_uint8

        # We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown)
        self.video_decoder_cache = None
@@ -553,7 +555,11 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
            root = self.meta.url_root if self.streaming and not self.streaming_from_local else self.root
            video_path = f"{root}/{self.meta.get_video_file_path(ep_idx, video_key)}"
            frames = decode_video_frames_torchcodec(
-                video_path, query_ts, self.tolerance_s, decoder_cache=self.video_decoder_cache
+                video_path,
+                query_ts,
+                self.tolerance_s,
+                decoder_cache=self.video_decoder_cache,
+                return_uint8=self._return_uint8,
            )

            item[video_key] = frames.squeeze(0) if len(query_ts) == 1 else frames
@@ -123,6 +123,7 @@ def decode_video_frames(
    timestamps: list[float],
    tolerance_s: float,
    backend: str | None = None,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """
    Decodes video frames using the specified backend.
@@ -131,19 +132,23 @@ def decode_video_frames(
        video_path (Path): Path to the video file.
        timestamps (list[float]): List of timestamps to extract frames.
        tolerance_s (float): Allowed deviation in seconds for frame retrieval.
-        backend (str, optional): Backend to use for decoding. Defaults to "torchcodec" when available in the platform; otherwise, defaults to "pyav"..
+        backend (str, optional): Backend to use for decoding. Defaults to "torchcodec" when available in the platform; otherwise, defaults to "pyav".
+        return_uint8 (bool): If True, return raw uint8 frames without float32 normalization.
+            This reduces memory for DataLoader IPC; normalization can be done on GPU afterward.

    Returns:
-        torch.Tensor: Decoded frames.
+        torch.Tensor: Decoded frames (float32 in [0,1] by default, or uint8 if return_uint8=True).

    Currently supports torchcodec on cpu and pyav.
    """
    if backend is None:
        backend = get_safe_default_codec()
    if backend == "torchcodec":
-        return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s)
+        return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8)
    elif backend in ["pyav", "video_reader"]:
-        return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
+        return decode_video_frames_torchvision(
+            video_path, timestamps, tolerance_s, backend, return_uint8=return_uint8
+        )
    else:
        raise ValueError(f"Unsupported video backend: {backend}")

@@ -154,6 +159,7 @@ def decode_video_frames_torchvision(
    tolerance_s: float,
    backend: str = "pyav",
    log_loaded_timestamps: bool = False,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """Loads frames associated to the requested timestamps of a video

@@ -240,14 +246,17 @@ def decode_video_frames_torchvision(
    if log_loaded_timestamps:
        logger.info(f"{closest_ts=}")

-    # convert to the pytorch format which is float32 in [0,1] range (and channel first)
-    closest_frames = closest_frames.type(torch.float32) / 255
-
    if len(timestamps) != len(closest_frames):
        raise FrameTimestampError(
            f"Number of retrieved frames ({len(closest_frames)}) does not match "
            f"number of queried timestamps ({len(timestamps)})"
        )
+
+    if return_uint8:
+        return closest_frames
+
+    # convert to the pytorch format which is float32 in [0,1] range (and channel first)
+    closest_frames = closest_frames.type(torch.float32) / 255
    return closest_frames


@@ -306,6 +315,7 @@ def decode_video_frames_torchcodec(
    tolerance_s: float,
    log_loaded_timestamps: bool = False,
    decoder_cache: VideoDecoderCache | None = None,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """Loads frames associated with the requested timestamps of a video using torchcodec.

@@ -373,14 +383,16 @@ def decode_video_frames_torchcodec(
    if log_loaded_timestamps:
        logger.info(f"{closest_ts=}")

-    # convert to float32 in [0,1] range
-    closest_frames = (closest_frames / 255.0).type(torch.float32)
-
    if not len(timestamps) == len(closest_frames):
        raise FrameTimestampError(
            f"Retrieved timestamps differ from queried {set(closest_frames) - set(timestamps)}"
        )

+    if return_uint8:
+        return closest_frames
+
+    # convert to float32 in [0,1] range
+    closest_frames = (closest_frames / 255.0).type(torch.float32)
    return closest_frames


@@ -496,6 +496,81 @@ class MetaworldEnv(EnvConfig):
        )


+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+    """Configuration for RoboCasa environments (registered under ``robocasa``).
+
+    ``camera_name`` is a comma-separated list of raw RoboCasa camera names.
+    Each one is exposed as a ``pixels/<cam>`` visual feature and mapped to
+    ``f"{OBS_IMAGES}.<cam>"`` without any renaming.
+    """
+
+    task: str = "CloseFridge"
+    fps: int = 20
+    episode_length: int = 1000
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+    observation_height: int = 256
+    observation_width: int = 256
+    visualization_height: int = 512
+    visualization_width: int = 512
+    split: str | None = None
+    # Registries that object meshes are sampled from. Upstream defaults to
+    # ("objaverse", "lightwheel"); objaverse is ~30GB and the CI image only
+    # ships the lightwheel pack, so only lightwheel is enabled here. Add
+    # objaverse after running `python -m robocasa.scripts.download_kitchen_assets
+    # --type objaverse` locally.
+    obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
+    )
+    features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
+
+    def __post_init__(self):
+        """Validate ``obs_type`` and register one visual feature per camera.
+
+        Raises:
+            ValueError: if ``obs_type`` is neither ``pixels`` nor ``pixels_agent_pos``.
+        """
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # Camera names are kept verbatim (e.g.
+        # `observation.images.robot0_agentview_left`) so they line up with the
+        # RoboCasa datasets on the Hub and no `--rename_map` is needed at eval.
+        image_shape = (self.observation_height, self.observation_width, 3)
+        for raw_name in self.camera_name.split(","):
+            cam = raw_name.strip()
+            if not cam:
+                # Blank entries (e.g. trailing commas) are ignored.
+                continue
+            pixel_key = f"pixels/{cam}"
+            self.features[pixel_key] = PolicyFeature(type=FeatureType.VISUAL, shape=image_shape)
+            self.features_map[pixel_key] = f"{OBS_IMAGES}.{cam}"
+
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """Keyword arguments forwarded to the env factory; ``split`` only when set."""
+        forwarded = (
+            "obs_type",
+            "render_mode",
+            "observation_height",
+            "observation_width",
+            "visualization_height",
+            "visualization_width",
+        )
+        kwargs: dict[str, Any] = {name: getattr(self, name) for name in forwarded}
+        if self.split is not None:
+            kwargs["split"] = self.split
+        return kwargs
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = False):
+        """Build the vectorized RoboCasa envs described by this config.
+
+        Raises:
+            ValueError: if ``task`` is ``None``.
+        """
+        # Imported here rather than at module import time.
+        from .robocasa import create_robocasa_envs
+
+        if self.task is None:
+            raise ValueError("RoboCasaEnv requires a task to be specified")
+        vec_env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_robocasa_envs(
+            task=self.task,
+            n_envs=n_envs,
+            camera_name=self.camera_name,
+            gym_kwargs=self.gym_kwargs,
+            env_cls=vec_env_cls,
+            episode_length=self.episode_length,
+            obj_registries=tuple(self.obj_registries),
+        )
+
+
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
 class IsaaclabArenaEnv(HubEnvConfig):
@@ -574,3 +649,90 @@ class IsaaclabArenaEnv(HubEnvConfig):
            ),
            PolicyProcessorPipeline(steps=[]),
        )
+
+
+@EnvConfig.register_subclass("robotwin")
+@dataclass
+class RoboTwinEnvConfig(EnvConfig):
+    """Environment config for the RoboTwin 2.0 benchmark.
+
+    RoboTwin 2.0 is a dual-arm manipulation benchmark (50 tasks) running on the
+    SAPIEN simulator with an Aloha-AgileX bimanual robot: 14 DOF, 7 per arm.
+    By default all three cameras are enabled.
+
+    See: https://robotwin-platform.github.io
+    Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
+    """
+
+    task: str = "beat_block_hammer"  # single task or comma-separated list
+    fps: int = 25
+    episode_length: int = 300
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    # Cameras offered by RoboTwin's aloha-agilex embodiment: head_camera
+    # (torso-mounted) plus left_camera / right_camera (wrists).
+    camera_names: str = "head_camera,left_camera,right_camera"
+    # Same as the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
+    # Gym's vector-env concatenate pre-allocates buffers of this shape, so the
+    # values must equal what SAPIEN actually renders.
+    observation_height: int = 240
+    observation_width: int = 320
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "pixels/head_camera": f"{OBS_IMAGES}.head_camera",
+            "pixels/left_camera": f"{OBS_IMAGES}.left_camera",
+            "pixels/right_camera": f"{OBS_IMAGES}.right_camera",
+            "agent_pos": OBS_STATE,
+        }
+    )
+
+    def __post_init__(self):
+        """Register per-camera visual features and validate ``obs_type``.
+
+        Raises:
+            ValueError: if ``obs_type`` is neither ``pixels`` nor ``pixels_agent_pos``.
+        """
+        image_shape = (self.observation_height, self.observation_width, 3)
+        for raw_name in self.camera_names.split(","):
+            cam = raw_name.strip()
+            if not cam:
+                continue
+            key = f"pixels/{cam}"
+            self.features[key] = PolicyFeature(type=FeatureType.VISUAL, shape=image_shape)
+            # An existing mapping (from default_factory or the caller) wins;
+            # only cameras without one get the default name.
+            self.features_map.setdefault(key, f"{OBS_IMAGES}.{cam}")
+
+        if self.obs_type == "pixels":
+            return
+        if self.obs_type != "pixels_agent_pos":
+            raise ValueError(
+                f"Unsupported obs_type '{self.obs_type}'. "
+                "RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
+            )
+        # 14 DOF: 7 per arm
+        self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(14,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """No extra gym kwargs; ``create_envs`` passes every setting explicitly."""
+        return {}
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = True):
+        """Build the vectorized RoboTwin envs described by this config.
+
+        Raises:
+            ValueError: if ``task`` is empty.
+        """
+        from lerobot.envs.robotwin import create_robotwin_envs
+
+        if not self.task:
+            raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
+
+        vec_env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        stripped = (part.strip() for part in self.camera_names.split(","))
+        cameras = [name for name in stripped if name]
+        return create_robotwin_envs(
+            task=self.task,
+            n_envs=n_envs,
+            env_cls=vec_env_cls,
+            camera_names=cameras,
+            observation_height=self.observation_height,
+            observation_width=self.observation_width,
+            episode_length=self.episode_length,
+        )
@@ -31,20 +31,7 @@ from libero.libero.envs import OffScreenRenderEnv

 from lerobot.types import RobotObservation

-from .utils import _LazyAsyncVectorEnv
-
-
-def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
-    """Normalize camera_name into a non-empty list of strings."""
-    if isinstance(camera_name, str):
-        cams = [c.strip() for c in camera_name.split(",") if c.strip()]
-    elif isinstance(camera_name, (list | tuple)):
-        cams = [str(c).strip() for c in camera_name if str(c).strip()]
-    else:
-        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
-    if not cams:
-        raise ValueError("camera_name resolved to an empty list.")
-    return cams
+from .utils import _LazyAsyncVectorEnv, parse_camera_names


 def _get_suite(name: str) -> benchmark.Benchmark:
@@ -128,7 +115,7 @@ class LiberoEnv(gym.Env):
        self.visualization_width = visualization_width
        self.visualization_height = visualization_height
        self.init_states = init_states
-        self.camera_name = _parse_camera_names(
+        self.camera_name = parse_camera_names(
            camera_name
        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)

@@ -437,7 +424,7 @@ def create_libero_envs(
    gym_kwargs = dict(gym_kwargs or {})
    task_ids_filter = gym_kwargs.pop("task_ids", None)  # optional: limit to specific tasks

-    camera_names = _parse_camera_names(camera_name)
+    camera_names = parse_camera_names(camera_name)
    suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
    if not suite_names:
        raise ValueError("`task` must contain at least one LIBERO suite name.")
@@ -462,6 +449,7 @@ def create_libero_envs(
        # Probe once and reuse to avoid creating a temp env per task.
        cached_obs_space: spaces.Space | None = None
        cached_act_space: spaces.Space | None = None
+        cached_metadata: dict[str, Any] | None = None

        for tid in selected:
            fns = _make_env_fns(
@@ -477,10 +465,11 @@ def create_libero_envs(
                camera_name_mapping=camera_name_mapping,
            )
            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[suite_name][tid] = lazy
            else:
                out[suite_name][tid] = env_cls(fns)
@@ -311,6 +311,7 @@ def create_metaworld_envs(
    is_async = env_cls is gym.vector.AsyncVectorEnv
    cached_obs_space = None
    cached_act_space = None
+    cached_metadata = None
    out: dict[str, dict[int, Any]] = defaultdict(dict)

    for group in task_groups:
@@ -324,10 +325,11 @@ def create_metaworld_envs(
            fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]

            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[group][tid] = lazy
            else:
                out[group][tid] = env_cls(fns)
@@ -0,0 +1,425 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
+
+logger = logging.getLogger(__name__)
+
+# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
+# These correspond to the PandaOmron robot in RoboCasa365.
+OBS_STATE_DIM = 16  # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
+ACTION_DIM = 12  # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+# Lower/upper bounds of the Box action space built in RoboCasaEnv.__init__.
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+
+# Default PandaOmron cameras. We surface these raw names directly as
+# `observation.images.<name>` so the LeRobot dataset/policy keys match
+# RoboCasa's native convention (no implicit renaming).
+DEFAULT_CAMERAS = [
+    "robot0_agentview_left",
+    "robot0_eye_in_hand",
+    "robot0_agentview_right",
+]
+
+# Object-mesh registries to sample from. RoboCasa's upstream default is
+# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
+# most users — including our CI image — only download the lightwheel pack
+# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
+# category has zero candidates in every registry, robocasa crashes with
+# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
+# normalization). Restricting to registries that are actually on disk
+# avoids the NaN and matches what the asset download provides.
+DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
+
+# Task-group shortcuts accepted as `--env.task`. When the user passes one of
+# these names, we expand it to the upstream RoboCasa task list and auto-set
+# the dataset split. Individual task names (optionally comma-separated) still
+# take precedence; this only triggers on an exact group-name match.
+# Keys are group names; values are the split string `_resolve_tasks` returns
+# alongside the expanded task list.
+_TASK_GROUP_SPLITS = {
+    "atomic_seen": "target",
+    "composite_seen": "target",
+    "composite_unseen": "target",
+    "pretrain50": "pretrain",
+    "pretrain100": "pretrain",
+    "pretrain200": "pretrain",
+    "pretrain300": "pretrain",
+}
+
+
+def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
+    """Turn a `--env.task` value into ``(task_names, split_override)``.
+
+    A value that exactly matches a key of `_TASK_GROUP_SPLITS` (after
+    stripping whitespace) is expanded through robocasa's
+    `dataset_registry.{TARGET,PRETRAINING}_TASKS` and paired with that group's
+    split. Anything else is read as one task name or a comma-separated list,
+    with no split override (``None``).
+
+    Raises:
+        ValueError: if the group is missing from the installed robocasa
+            registry, or if no non-blank task name remains.
+    """
+    group = task.strip()
+
+    if group not in _TASK_GROUP_SPLITS:
+        # Plain task name(s): split on commas and drop blank entries.
+        stripped = (part.strip() for part in task.split(","))
+        names = [name for name in stripped if name]
+        if not names:
+            raise ValueError("`task` must contain at least one RoboCasa task name.")
+        return names, None
+
+    # robocasa is only imported when a group shortcut is actually requested.
+    from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
+
+    registry = {**TARGET_TASKS, **PRETRAINING_TASKS}
+    if group not in registry:
+        raise ValueError(
+            f"Task group '{group}' is not available in this version of robocasa. "
+            f"Known groups: {sorted(registry)}."
+        )
+    return list(registry[group]), _TASK_GROUP_SPLITS[group]
+
+
+def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
+    """Slice a flat (12,) action vector into the RoboCasa action dict.
+
+    Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1).
+    Each value is a slice of `flat_action`, not a copy.
+    """
+    # (key, start, stop) for each contiguous segment of the flat vector.
+    layout = (
+        ("action.base_motion", 0, 4),
+        ("action.control_mode", 4, 5),
+        ("action.end_effector_position", 5, 8),
+        ("action.end_effector_rotation", 8, 11),
+        ("action.gripper_close", 11, 12),
+    )
+    return {key: flat_action[start:stop] for key, start, stop in layout}
+
+
+class RoboCasaEnv(gym.Env):
+    """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
+
+    Wraps RoboCasaGymEnv from the robocasa package and converts its
+    dict-based observations and actions into the flat arrays LeRobot expects.
+    Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
+
+    The underlying simulator is created lazily on first use (`_ensure_env`).
+    `close()` drops the simulator handle, so it is safe to call more than once
+    and a later `reset()` builds a fresh simulator instead of using a closed one.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
+
+    def __init__(
+        self,
+        task: str,
+        camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+        obs_type: str = "pixels_agent_pos",
+        render_mode: str = "rgb_array",
+        observation_width: int = 256,
+        observation_height: int = 256,
+        visualization_width: int = 512,
+        visualization_height: int = 512,
+        split: str | None = None,
+        episode_length: int | None = None,
+        obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+        episode_index: int = 0,
+    ):
+        """Store the configuration and declare observation/action spaces.
+
+        No simulator is created here; see `_ensure_env`.
+
+        Args:
+            task: RoboCasa environment name, passed as `env_name`.
+            camera_name: Comma-separated string or sequence of camera names.
+            obs_type: `pixels` or `pixels_agent_pos`.
+            render_mode: Stored on the instance.
+            observation_width: Width of the camera frames.
+            observation_height: Height of the camera frames.
+            visualization_width: Stored on the instance.
+            visualization_height: Stored on the instance.
+            split: Dataset split; `None` is replaced by "all" when the
+                simulator is created.
+            episode_length: Value for `_max_episode_steps`; 1000 when `None`.
+            obj_registries: Object-mesh registries forwarded to the simulator.
+            episode_index: Per-worker offset added to the seed in `reset()`.
+
+        Raises:
+            ValueError: if `obs_type` is not one of the supported values.
+        """
+        super().__init__()
+        self.task = task
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.observation_width = observation_width
+        self.observation_height = observation_height
+        self.visualization_width = visualization_width
+        self.visualization_height = visualization_height
+        self.split = split
+        self.obj_registries = tuple(obj_registries)
+        # Per-worker index (0..n_envs-1) used to spread the user-provided
+        # seed across factories so each sub-env explores a distinct layout
+        # even when the same seed is passed to `reset()`.
+        self.episode_index = int(episode_index)
+
+        self.camera_name = parse_camera_names(camera_name)
+
+        self._max_episode_steps = episode_length if episode_length is not None else 1000
+
+        # Deferred — created on first reset() inside the worker subprocess
+        # to avoid inheriting stale GPU/EGL contexts across fork().
+        self._env: Any = None
+        self.task_description = ""
+
+        # One uint8 HxWx3 image space per requested camera.
+        images = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_name
+        }
+
+        if self.obs_type == "pixels":
+            self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-np.inf,
+                        high=np.inf,
+                        shape=(OBS_STATE_DIM,),
+                        dtype=np.float32,
+                    ),
+                }
+            )
+        else:
+            raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
+
+        self.action_space = spaces.Box(
+            low=ACTION_LOW,
+            high=ACTION_HIGH,
+            shape=(ACTION_DIM,),
+            dtype=np.float32,
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the underlying RoboCasaGymEnv on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+        Does nothing when a simulator already exists.
+        """
+        if self._env is not None:
+            return
+        from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
+
+        # RoboCasaGymEnv defaults split="test", which create_env rejects
+        # (only None/"all"/"pretrain"/"target" are valid). Always pass a
+        # valid value so we don't hit that default. Extra kwargs are
+        # forwarded to the underlying kitchen env via create_env/robosuite.make.
+        self._env = RoboCasaGymEnv(
+            env_name=self.task,
+            camera_widths=self.observation_width,
+            camera_heights=self.observation_height,
+            split=self.split if self.split is not None else "all",
+            obj_registries=self.obj_registries,
+        )
+
+        # Use the episode's language instruction, falling back to the task name.
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+    def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
+        """Convert RoboCasaGymEnv observation dict to LeRobot format.
+
+        Returns `{"pixels": images}` for `obs_type == "pixels"`, otherwise
+        `{"pixels": images, "agent_pos": float32 vector}`.
+        """
+        # RoboCasaGymEnv emits camera frames under "video.<cam>".
+        # NOTE(review): cameras missing from raw_obs are silently omitted, so the
+        # result can contain fewer images than the declared observation space.
+        images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+
+        # `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
+        # Missing keys are zero-filled so the concatenated vector keeps its length.
+        agent_pos = np.concatenate(
+            [
+                raw_obs.get("state.base_position", np.zeros(3)),
+                raw_obs.get("state.base_rotation", np.zeros(4)),
+                raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
+                raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
+                raw_obs.get("state.gripper_qpos", np.zeros(2)),
+            ],
+            axis=-1,
+        ).astype(np.float32)
+
+        return {"pixels": images, "agent_pos": agent_pos}
+
+    def render(self) -> np.ndarray:
+        """Return the underlying env's rendering, creating the simulator if needed."""
+        self._ensure_env()
+        assert self._env is not None
+        return self._env.render()
+
+    def reset(self, seed=None, **kwargs):
+        """Reset the simulator and return `(observation, info)`.
+
+        `info` is always `{"is_success": False}`; the info returned by the
+        underlying env is discarded. Extra keyword arguments are accepted and
+        ignored.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        super().reset(seed=seed)
+        # Spread the seed across workers so n_envs factories don't all
+        # roll the same scene. With an explicit user seed we shift it by
+        # episode_index; with no seed we fall back to episode_index so
+        # each worker is still distinct rather than inheriting the same
+        # global RNG state. An unseeded reset therefore always passes the
+        # same seed (episode_index) to the underlying env.
+        worker_seed = seed + self.episode_index if seed is not None else self.episode_index
+        raw_obs, _ = self._env.reset(seed=worker_seed)
+
+        # The language instruction can change per episode; refresh it.
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+        observation = self._format_raw_obs(raw_obs)
+        info = {"is_success": False}
+        return observation, info
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one flat action and return the gym 5-tuple.
+
+        The episode terminates when the underlying env reports `done` or the
+        info dict reports `success`. On termination `info["final_info"]` is
+        filled in and the env is reset; the returned observation is the one
+        produced by this step, not the post-reset one.
+
+        Raises:
+            ValueError: if `action` is not 1-D.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        action_dict = convert_action(action)
+        raw_obs, reward, done, truncated, info = self._env.step(action_dict)
+
+        is_success = bool(info.get("success", False))
+        terminated = done or is_success
+        info.update({"task": self.task, "done": done, "is_success": is_success})
+
+        observation = self._format_raw_obs(raw_obs)
+        if terminated:
+            info["final_info"] = {
+                "task": self.task,
+                "done": bool(done),
+                "is_success": bool(is_success),
+            }
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+    def close(self):
+        """Close the underlying simulator, if one was created.
+
+        The handle is cleared before closing, so a repeated `close()` is a
+        no-op and a later `reset()`/`step()`/`render()` rebuilds the simulator
+        through `_ensure_env()` instead of operating on a closed one.
+        """
+        env, self._env = self._env, None
+        if env is not None:
+            env.close()
+
+
+def _make_env_fns(
+    *,
+    task: str,
+    n_envs: int,
+    camera_names: list[str],
+    obs_type: str,
+    render_mode: str,
+    observation_width: int,
+    observation_height: int,
+    visualization_width: int,
+    visualization_height: int,
+    split: str | None,
+    episode_length: int | None,
+    obj_registries: Sequence[str],
+) -> list[Callable[[], RoboCasaEnv]]:
+    """Return ``n_envs`` zero-argument factories for one task.
+
+    All factories share the same settings and differ only in
+    ``episode_index`` (``0..n_envs-1``), which ``RoboCasaEnv.reset()`` adds to
+    the user-provided seed to give each worker its own seed series.
+    """
+    # Settings common to every sub-env of this task.
+    shared_kwargs = {
+        "task": task,
+        "camera_name": camera_names,
+        "obs_type": obs_type,
+        "render_mode": render_mode,
+        "observation_width": observation_width,
+        "observation_height": observation_height,
+        "visualization_width": visualization_width,
+        "visualization_height": visualization_height,
+        "split": split,
+        "episode_length": episode_length,
+        "obj_registries": obj_registries,
+    }
+
+    factories = []
+    for worker_index in range(n_envs):
+        factories.append(partial(RoboCasaEnv, episode_index=worker_index, **shared_kwargs))
+    return factories
+
+
+def create_robocasa_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] | None = None,
+    camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    episode_length: int | None = None,
+    obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+) -> dict[str, dict[int, Any]]:
+    """Build one vectorized RoboCasa365 env per task, in a uniform return shape.
+
+    Returns:
+        dict[task_name][task_id] -> vec_env, where task_id is always 0 and each
+        vec env is built from exactly n_envs factories.
+
+    `task` may be:
+      - one task name (e.g. `CloseFridge`)
+      - several task names separated by commas (e.g. `CloseFridge,PickPlaceCoffee`)
+      - a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
+        `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
+        `pretrain300`); it is expanded to the upstream task list and sets the
+        dataset `split` ("target" or "pretrain") unless `gym_kwargs` already
+        provides one.
+
+    Raises:
+        ValueError: if `env_cls` is not callable or `n_envs` is not a positive int.
+    """
+    if not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not (isinstance(n_envs, int) and n_envs > 0):
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    # Work on a copy so the caller's dict is left untouched.
+    options = dict(gym_kwargs or {})
+    env_options = {
+        name: options.pop(name, fallback)
+        for name, fallback in (
+            ("obs_type", "pixels_agent_pos"),
+            ("render_mode", "rgb_array"),
+            ("observation_width", 256),
+            ("observation_height", 256),
+            ("visualization_width", 512),
+            ("visualization_height", 512),
+        )
+    }
+    split = options.pop("split", None)
+
+    camera_names = parse_camera_names(camera_name)
+    task_names, group_split = _resolve_tasks(str(task))
+    # An explicit split from gym_kwargs takes priority over the group's split.
+    if split is None and group_split is not None:
+        split = group_split
+
+    logger.info(
+        "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
+        task_names,
+        split,
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+
+    # Spaces and metadata taken from the first lazy env and handed to the rest.
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
+    out: dict[str, dict[int, Any]] = defaultdict(dict)
+
+    for task_name in task_names:
+        factories = _make_env_fns(
+            task=task_name,
+            n_envs=n_envs,
+            camera_names=camera_names,
+            split=split,
+            episode_length=episode_length,
+            obj_registries=obj_registries,
+            **env_options,
+        )
+
+        if not is_async:
+            vec_env = env_cls(factories)
+        else:
+            vec_env = _LazyAsyncVectorEnv(factories, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = vec_env.observation_space
+                cached_act_space = vec_env.action_space
+                cached_metadata = vec_env.metadata
+        out[task_name][0] = vec_env
+        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
+
+    # Hand back plain dicts rather than the defaultdict.
+    return {name: dict(task_map) for name, task_map in out.items()}
@@ -0,0 +1,488 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import importlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+# Camera names as used by RoboTwin 2.0. The wrapper reads each camera's frame
+# from get_obs()["observation"][<camera name>]["rgb"] and exposes it under the
+# same camera name in the "pixels" observation dict.
+ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
+    "head_camera",
+    "left_camera",
+    "right_camera",
+)
+
+ACTION_DIM = 14  # 7 DOF × 2 arms
+# Bounds of the Box action space exposed by RoboTwinEnv.
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+# Default number of steps after which an episode is truncated.
+DEFAULT_EPISODE_LENGTH = 300
+# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
+DEFAULT_CAMERA_H = 240
+DEFAULT_CAMERA_W = 320
+
+# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
+# (50 tasks as of main; earlier revisions had 60 with a different split).
+# Keep this in sync with:
+#   gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
+#     | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
+# create_robotwin_envs() rejects any requested task name that is not listed here.
+ROBOTWIN_TASKS: tuple[str, ...] = (
+    "adjust_bottle",
+    "beat_block_hammer",
+    "blocks_ranking_rgb",
+    "blocks_ranking_size",
+    "click_alarmclock",
+    "click_bell",
+    "dump_bin_bigbin",
+    "grab_roller",
+    "handover_block",
+    "handover_mic",
+    "hanging_mug",
+    "lift_pot",
+    "move_can_pot",
+    "move_pillbottle_pad",
+    "move_playingcard_away",
+    "move_stapler_pad",
+    "open_laptop",
+    "open_microwave",
+    "pick_diverse_bottles",
+    "pick_dual_bottles",
+    "place_a2b_left",
+    "place_a2b_right",
+    "place_bread_basket",
+    "place_bread_skillet",
+    "place_burger_fries",
+    "place_can_basket",
+    "place_cans_plasticbox",
+    "place_container_plate",
+    "place_dual_shoes",
+    "place_empty_cup",
+    "place_fan",
+    "place_mouse_pad",
+    "place_object_basket",
+    "place_object_scale",
+    "place_object_stand",
+    "place_phone_stand",
+    "place_shoe",
+    "press_stapler",
+    "put_bottles_dustbin",
+    "put_object_cabinet",
+    "rotate_qrcode",
+    "scan_object",
+    "shake_bottle",
+    "shake_bottle_horizontally",
+    "stack_blocks_three",
+    "stack_blocks_two",
+    "stack_bowls_three",
+    "stack_bowls_two",
+    "stamp_seal",
+    "turn_switch",
+)
+
+
+# Per-task memo of the kwargs built by _load_robotwin_setup_kwargs().
+_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
+
+
+def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
+    """Assemble the keyword arguments handed to a RoboTwin task's ``setup_demo``.
+
+    Follows the config loading of RoboTwin's ``script/eval_policy.py``:
+
+    1. read ``task_config/demo_clean.yml``;
+    2. resolve the robot file(s) through ``_embodiment_config.yml``;
+    3. load each robot's own ``config.yml``;
+    4. read the head-camera dimensions from ``_camera_config.yml``.
+
+    When the task config has no ``embodiment`` entry, ``aloha-agilex``
+    (single-robot, dual-arm) is used.
+
+    The result is memoised per task name. Callers get a *shallow* copy, so
+    top-level keys may be overridden freely, while nested config dicts are
+    shared with the cached entry.
+
+    Args:
+        task_name: RoboTwin task name, stored under the ``task_name`` key.
+
+    Returns:
+        A fresh top-level dict of ``setup_demo`` keyword arguments.
+
+    Raises:
+        ValueError: If the ``embodiment`` entry has neither 1 nor 3 items.
+    """
+    cached = _ROBOTWIN_SETUP_CACHE.get(task_name)
+    if cached is not None:
+        return dict(cached)
+
+    # Imported lazily so that importing this module does not require RoboTwin.
+    import os
+
+    import yaml  # type: ignore[import-untyped]
+    from envs import CONFIGS_PATH  # type: ignore[import-not-found]
+
+    def _read_yaml(path: str) -> Any:
+        with open(path, encoding="utf-8") as handle:
+            return yaml.safe_load(handle)
+
+    task_config = "demo_clean"
+    args = _read_yaml(os.path.join(CONFIGS_PATH, f"{task_config}.yml"))
+
+    # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
+    embodiment_types = _read_yaml(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"))
+    embodiment = args.get("embodiment", ["aloha-agilex"])
+    if len(embodiment) not in (1, 3):
+        raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
+    if len(embodiment) == 1:
+        # One entry: the same robot description drives both arms.
+        shared_robot_file = embodiment_types[embodiment[0]]["file_path"]
+        args["left_robot_file"] = shared_robot_file
+        args["right_robot_file"] = shared_robot_file
+        args["dual_arm_embodied"] = True
+    else:
+        # Three entries: left robot, right robot, and the value stored as "embodiment_dis".
+        left_key, right_key, distance = embodiment
+        args["left_robot_file"] = embodiment_types[left_key]["file_path"]
+        args["right_robot_file"] = embodiment_types[right_key]["file_path"]
+        args["embodiment_dis"] = distance
+        args["dual_arm_embodied"] = False
+
+    for side in ("left", "right"):
+        robot_config_path = os.path.join(args[f"{side}_robot_file"], "config.yml")
+        args[f"{side}_embodiment_config"] = _read_yaml(robot_config_path)
+
+    # Camera dimensions
+    camera_config = _read_yaml(os.path.join(CONFIGS_PATH, "_camera_config.yml"))
+    head_cam = args["camera"]["head_camera_type"]
+    args["head_camera_h"] = camera_config[head_cam]["h"]
+    args["head_camera_w"] = camera_config[head_cam]["w"]
+
+    # Headless overrides
+    args.update(render_freq=0, task_name=task_name, task_config=task_config)
+
+    _ROBOTWIN_SETUP_CACHE[task_name] = args
+    return dict(args)
+
+
+def _load_robotwin_task(task_name: str) -> type:
+    """Import ``envs.<task_name>`` and return the task class defined in it.
+
+    RoboTwin 2.0 keeps each task in ``envs/<task_name>.py`` and names the class
+    after the module, so the class is looked up as an attribute called
+    ``task_name`` on the imported module.
+
+    Args:
+        task_name: Name of both the task module and the class inside it.
+
+    Returns:
+        The task class.
+
+    Raises:
+        ModuleNotFoundError: If ``envs.<task_name>`` cannot be imported; the
+            original error is chained as the cause.
+        AttributeError: If the module has no (non-``None``) attribute named
+            ``task_name``.
+    """
+    try:
+        task_module = importlib.import_module(f"envs.{task_name}")
+    except ModuleNotFoundError as e:
+        install_hint = (
+            f"Could not import RoboTwin task '{task_name}'. "
+            "Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
+            "See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
+        )
+        raise ModuleNotFoundError(install_hint) from e
+
+    resolved = getattr(task_module, task_name, None)
+    if resolved is not None:
+        return resolved
+    raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
+
+
+class RoboTwinEnv(gym.Env):
+    """Gymnasium wrapper around a single RoboTwin 2.0 task.
+
+    RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
+    ``take_action`` / ``check_success``) rather than the standard gym interface.
+    This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
+    RoboTwin exactly like LIBERO or Meta-World.
+
+    The underlying SAPIEN environment is created lazily on the first ``reset()``
+    call *inside the worker process*.  This is required for
+    ``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
+    contexts that must not be forked from the parent process.
+
+    Observations
+    ------------
+    The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
+    ``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
+    ``envs/utils.py`` then converts these to ``observation.images.<cam>``.
+    Frames are converted to ``uint8`` and normalised to three channels:
+    grayscale frames (``(H, W)`` or ``(H, W, 1)``) are replicated across
+    channels and channels beyond the third (e.g. alpha) are dropped. A camera
+    missing from the raw observation yields an all-black frame.
+    ``agent_pos`` holds the first ``ACTION_DIM`` entries of
+    ``joint_action["vector"]``, or zeros when that vector is absent or shorter.
+
+    Actions
+    -------
+    14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
+
+    Episode boundaries
+    ------------------
+    ``step()`` resets the environment itself as soon as an episode terminates
+    (success) or is truncated (``episode_length`` steps). The observation
+    returned by that ``step()`` call is still the last one of the finished
+    episode, and ``info["final_info"]`` carries its outcome.
+
+    Autograd
+    --------
+    ``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
+    optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
+    the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
+
+    def __init__(
+        self,
+        task_name: str,
+        episode_index: int = 0,
+        n_envs: int = 1,
+        camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+        observation_height: int | None = None,
+        observation_width: int | None = None,
+        episode_length: int = DEFAULT_EPISODE_LENGTH,
+        render_mode: str = "rgb_array",
+    ):
+        """Configure the wrapper without touching RoboTwin or SAPIEN.
+
+        Args:
+            task_name: RoboTwin task to load on first ``reset()``.
+            episode_index: Seed used by the first unseeded ``reset()``.
+            n_envs: Stride added to ``episode_index`` after every reset, so
+                that sibling envs started at ``0..n_envs-1`` never share a seed.
+            camera_names: Cameras exposed in the ``pixels`` observation.
+            observation_height: Declared frame height; falsy values fall back
+                to ``DEFAULT_CAMERA_H``.
+            observation_width: Declared frame width; falsy values fall back to
+                ``DEFAULT_CAMERA_W``.
+            episode_length: Number of steps after which an episode is truncated.
+            render_mode: Stored as ``self.render_mode``.
+        """
+        super().__init__()
+        self.task_name = task_name
+        self.task = task_name  # used by add_envs_task() in utils.py
+        self.task_description = task_name.replace("_", " ")
+        self.episode_index = episode_index
+        self._reset_stride = n_envs
+        self.camera_names = list(camera_names)
+        # Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
+        # The YAML-driven lookup is deferred to reset() so construction doesn't
+        # import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
+        self.observation_height = observation_height or DEFAULT_CAMERA_H
+        self.observation_width = observation_width or DEFAULT_CAMERA_W
+        self.episode_length = episode_length
+        self._max_episode_steps = episode_length  # lerobot_eval.rollout reads this
+        self.render_mode = render_mode
+
+        self._env: Any | None = None  # deferred — created on first reset() inside worker
+        self._step_count: int = 0
+        # Placeholder returned for cameras that are absent from the raw observation.
+        self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
+
+        image_spaces = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_names
+        }
+        self.observation_space = spaces.Dict(
+            {
+                "pixels": spaces.Dict(image_spaces),
+                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
+            }
+        )
+        self.action_space = spaces.Box(
+            low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the SAPIEN environment on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own EGL/GPU context rather than inheriting a stale one from the
+        parent process (which causes crashes with AsyncVectorEnv).
+        """
+        if self._env is not None:
+            return
+        task_cls = _load_robotwin_task(self.task_name)
+        self._env = task_cls()
+
+    def _get_obs(self) -> RobotObservation:
+        """Convert RoboTwin's raw ``get_obs()`` output into the wrapper's observation dict.
+
+        Returns:
+            ``{"pixels": {cam: uint8 image}, "agent_pos": float32 vector of ACTION_DIM}``.
+        """
+        assert self._env is not None, "_get_obs called before _ensure_env()"
+        raw = self._env.get_obs()
+        cameras_raw = raw.get("observation", {})
+
+        images: dict[str, np.ndarray] = {}
+        for cam in self.camera_names:
+            cam_data = cameras_raw.get(cam)
+            img = cam_data.get("rgb") if cam_data else None
+            if img is None:
+                images[cam] = self._black_frame
+                continue
+            img = np.asarray(img, dtype=np.uint8)
+            if img.ndim == 2:
+                # Grayscale (H, W): replicate into three identical channels.
+                img = np.stack([img, img, img], axis=-1)
+            elif img.shape[-1] == 1:
+                # Grayscale (H, W, 1): slicing cannot add channels, so replicate.
+                img = np.repeat(img, 3, axis=-1)
+            elif img.shape[-1] > 3:
+                # e.g. RGBA: keep only the first three channels.
+                img = img[..., :3]
+            images[cam] = img
+
+        ja = raw.get("joint_action") or {}
+        vec = ja.get("vector")
+        if vec is not None:
+            arr = np.asarray(vec, dtype=np.float32).ravel()
+            # A vector shorter than ACTION_DIM cannot fill agent_pos; fall back to zeros.
+            joint_state = (
+                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
+            )
+        else:
+            joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
+
+        return {"pixels": images, "agent_pos": joint_state}
+
+    def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
+        """Start a new episode and return its first observation.
+
+        Args:
+            seed: Seed forwarded to ``setup_demo``. When ``None``, the current
+                ``episode_index`` is used instead.
+            **kwargs: Accepted for Gymnasium compatibility (e.g. ``options``); unused.
+
+        Returns:
+            ``(observation, info)`` with ``info["is_success"]`` set to ``False``.
+        """
+        self._ensure_env()
+        super().reset(seed=seed)
+        assert self._env is not None  # set by _ensure_env() above
+
+        actual_seed = self.episode_index if seed is None else seed
+        setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
+        setup_kwargs.update(seed=actual_seed, is_test=True)
+        # See the "Autograd" section of the class docstring.
+        with torch.enable_grad():
+            self._env.setup_demo(**setup_kwargs)
+        # Advance by the stride so the next unseeded reset uses a fresh seed.
+        self.episode_index += self._reset_stride
+        self._step_count = 0
+
+        obs = self._get_obs()
+        return obs, {"is_success": False, "task": self.task_name}
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one action and report the outcome.
+
+        Args:
+            action: 1-D action of length ``ACTION_DIM``. Lists and tuples are
+                converted to a float32 array before validation.
+
+        Returns:
+            ``(observation, reward, terminated, truncated, info)`` where
+            ``reward`` is ``1.0`` on success and ``0.0`` otherwise,
+            ``terminated`` mirrors success and ``truncated`` is set once
+            ``episode_length`` steps have been taken.
+
+        Raises:
+            ValueError: If the action is not 1-D with ``ACTION_DIM`` entries.
+        """
+        assert self._env is not None, "step() called before reset()"
+        if isinstance(action, (list, tuple)):
+            # Plain sequences have no .ndim/.shape; coerce so they are validated
+            # like arrays instead of failing with AttributeError.
+            action = np.asarray(action, dtype=np.float32)
+        if action.ndim != 1 or action.shape[0] != ACTION_DIM:
+            raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
+
+        # See the "Autograd" section of the class docstring.
+        with torch.enable_grad():
+            if hasattr(self._env, "take_action"):
+                self._env.take_action(action)
+            else:
+                self._env.step(action)
+
+        self._step_count += 1
+
+        # Prefer the flag the task may already have set; otherwise ask the task.
+        is_success = bool(getattr(self._env, "eval_success", False))
+        if not is_success and hasattr(self._env, "check_success"):
+            is_success = bool(self._env.check_success())
+
+        obs = self._get_obs()
+        reward = float(is_success)
+        terminated = is_success
+        truncated = self._step_count >= self.episode_length
+
+        info: dict[str, Any] = {
+            "task": self.task_name,
+            "is_success": is_success,
+            "step": self._step_count,
+        }
+        if terminated or truncated:
+            info["final_info"] = {
+                "task": self.task_name,
+                "is_success": is_success,
+            }
+            # Reset right away; `obs` above still belongs to the finished episode.
+            self.reset()
+
+        return obs, reward, terminated, truncated, info
+
+    def render(self) -> np.ndarray:
+        """Return the current head-camera frame, or the first available camera frame."""
+        self._ensure_env()
+        obs = self._get_obs()
+        # Prefer head camera for rendering; fall back to first available.
+        if "head_camera" in obs["pixels"]:
+            return obs["pixels"]["head_camera"]
+        return next(iter(obs["pixels"].values()))
+
+    def close(self) -> None:
+        """Release the underlying RoboTwin environment, if one was created."""
+        if self._env is not None:
+            if hasattr(self._env, "close_env"):
+                import contextlib
+
+                # Best-effort shutdown: a TypeError raised by close_env() is ignored.
+                with contextlib.suppress(TypeError):
+                    self._env.close_env()
+            self._env = None
+
+
+# ---- Multi-task factory --------------------------------------------------------
+
+
+def _make_env_fns(
+    *,
+    task_name: str,
+    n_envs: int,
+    camera_names: list[str],
+    observation_height: int,
+    observation_width: int,
+    episode_length: int,
+) -> list[Callable[[], RoboTwinEnv]]:
+    """Build ``n_envs`` zero-argument constructors for one task.
+
+    The i-th callable creates a ``RoboTwinEnv`` whose ``episode_index`` starts
+    at ``i``; every env receives ``n_envs`` as well, which it uses as the
+    stride between its successive episode indices. No environment is
+    instantiated until a callable is invoked.
+    """
+    # Everything except the starting episode index is identical across envs.
+    common = {
+        "task_name": task_name,
+        "n_envs": n_envs,
+        "camera_names": camera_names,
+        "observation_height": observation_height,
+        "observation_width": observation_width,
+        "episode_length": episode_length,
+    }
+
+    def _build(start_index: int) -> RoboTwinEnv:
+        return RoboTwinEnv(episode_index=start_index, **common)
+
+    return [partial(_build, slot) for slot in range(n_envs)]
+
+
+def create_robotwin_envs(
+    task: str,
+    n_envs: int,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+    observation_height: int = DEFAULT_CAMERA_H,
+    observation_width: int = DEFAULT_CAMERA_W,
+    episode_length: int = DEFAULT_EPISODE_LENGTH,
+) -> dict[str, dict[int, Any]]:
+    """Create vectorized RoboTwin 2.0 environments.
+
+    Returns:
+        ``dict[task_name][0] -> VectorEnv`` — one entry per distinct task, each
+        wrapping ``n_envs`` parallel rollouts.
+
+    Args:
+        task: Comma-separated list of task names (e.g. ``"beat_block_hammer"``
+            or ``"beat_block_hammer,click_bell"``). Surrounding whitespace and
+            empty entries are ignored; a name listed more than once is built
+            only once.
+        n_envs: Number of parallel rollouts per task.
+        env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``).
+        camera_names: Cameras to include in observations.
+        observation_height: Pixel height for all cameras.
+        observation_width: Pixel width for all cameras.
+        episode_length: Max steps before truncation.
+
+    Raises:
+        ValueError: If ``env_cls`` is not callable, ``n_envs`` is not a
+            positive int, ``task`` contains no task name, or any requested
+            task is not in ``ROBOTWIN_TASKS``.
+    """
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    # dict.fromkeys de-duplicates while keeping first-seen order. Without it a
+    # repeated name would build a second vector env for the same key and drop
+    # the first one without ever closing it.
+    task_names = list(dict.fromkeys(t.strip() for t in str(task).split(",") if t.strip()))
+    if not task_names:
+        raise ValueError("`task` must contain at least one RoboTwin task name.")
+
+    unknown = [t for t in task_names if t not in ROBOTWIN_TASKS]
+    if unknown:
+        raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
+
+    logger.info(
+        "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
+        task_names,
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    # Spaces and metadata taken from the first lazy env and passed to the later
+    # ones, so only the first task has to instantiate a temporary env to read them.
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
+
+    out: dict[str, dict[int, Any]] = {}
+    for task_name in task_names:
+        fns = _make_env_fns(
+            task_name=task_name,
+            n_envs=n_envs,
+            camera_names=list(camera_names),
+            observation_height=observation_height,
+            observation_width=observation_width,
+            episode_length=episode_length,
+        )
+        if is_async:
+            # Async envs are wrapped lazily; worker processes are not started here.
+            lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = lazy.observation_space
+                cached_act_space = lazy.action_space
+                cached_metadata = lazy.metadata
+            out[task_name] = {0: lazy}
+        else:
+            out[task_name] = {0: env_cls(fns)}
+        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
+
+    return out
@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
 from .configs import EnvConfig


+def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
+    """Turn ``camera_name`` into a non-empty list of camera names.
+
+    A string is split on commas (``"cam_a,cam_b"``); a list or tuple is taken
+    element by element, each converted with ``str``. Every entry is stripped
+    of surrounding whitespace and entries that end up empty are discarded.
+
+    Raises:
+        TypeError: If ``camera_name`` is neither a string, a list nor a tuple.
+        ValueError: If no name is left after normalization.
+    """
+    if isinstance(camera_name, str):
+        raw_entries = camera_name.split(",")
+    elif isinstance(camera_name, (list, tuple)):
+        raw_entries = [str(entry) for entry in camera_name]
+    else:
+        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
+
+    names = []
+    for entry in raw_entries:
+        cleaned = entry.strip()
+        if cleaned:
+            names.append(cleaned)
+    if names:
+        return names
+    raise ValueError("camera_name resolved to an empty list.")
+
+
 def _convert_nested_dict(d):
    result = {}
    for k, v in d.items():
@@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv:
        env_fns: list[Callable],
        observation_space=None,
        action_space=None,
+        metadata=None,
    ):
        self._env_fns = env_fns
        self._env: gym.vector.AsyncVectorEnv | None = None
        self.num_envs = len(env_fns)
-        if observation_space is not None and action_space is not None:
+        if observation_space is not None and action_space is not None and metadata is not None:
            self.observation_space = observation_space
            self.action_space = action_space
+            self.metadata = metadata
        else:
            tmp = env_fns[0]()
            self.observation_space = tmp.observation_space
            self.action_space = tmp.action_space
+            self.metadata = tmp.metadata
            tmp.close()
        self.single_observation_space = self.observation_space
        self.single_action_space = self.action_space
@@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv:
        if self._env is None:
            self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)

+    @property
+    def unwrapped(self):
+        """Return this lazy wrapper itself; the worker processes are not started.
+
+        NOTE(review): presumably provided so callers that access ``.unwrapped``
+        on a vector env also work with the lazy wrapper — confirm against callers.
+        """
+        return self
+
    def reset(self, **kwargs):
        self._ensure()
        return self._env.reset(**kwargs)
@@ -12,8 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import numpy as np

+from lerobot.utils.import_utils import _placo_available, require_package
+
+if TYPE_CHECKING or _placo_available:
+    import placo  # type: ignore[import-not-found]
+else:
+    placo = None
+

 class RobotKinematics:
    """Robot kinematics using placo library for forward and inverse kinematics."""
@@ -32,13 +43,7 @@ class RobotKinematics:
            target_frame_name (str): Name of the end-effector frame in the URDF
            joint_names (list[str] | None): List of joint names to use for the kinematics solver
        """
-        try:
-            import placo  # type: ignore[import-not-found] # C++ library with Python bindings, no type stubs available. TODO: Create stub file or request upstream typing support.
-        except ImportError as e:
-            raise ImportError(
-                "placo is required for RobotKinematics. "
-                "Please install the optional dependencies of `kinematics` in the package."
-            ) from e
+        require_package("placo", extra="placo-dep")

        self.robot = placo.RobotWrapper(urdf_path)
        self.solver = placo.KinematicsSolver(self.robot)
@@ -24,7 +24,7 @@ from functools import cached_property
 from typing import TYPE_CHECKING, Any, TypedDict

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _can_available
+from lerobot.utils.import_utils import _can_available, require_package

 if TYPE_CHECKING or _can_available:
    import can
@@ -111,6 +111,7 @@ class DamiaoMotorsBus(MotorsBusBase):
            bitrate: Nominal bitrate in bps (default: 1000000 = 1 Mbps)
            data_bitrate: Data bitrate for CAN FD in bps (default: 5000000 = 5 Mbps), ignored if use_can_fd is False
        """
+        require_package("python-can", extra="damiao", import_name="can")
        super().__init__(port, motors, calibration)
        self.port = port
        self.can_interface = can_interface
@@ -216,6 +216,14 @@ class FeetechMotorsBus(SerialMotorsBus):
                self.write("Maximum_Acceleration", motor, maximum_acceleration)
            self.write("Acceleration", motor, acceleration)

+            # Clear bit 4 (0x10) of the Phase register (0x12) to set angle feedback mode to 0.
+            # This forces position readings to be in the range [0, resolution - 1] and prevents overflow or negative values.
+            # Only known to be necessary for the STS3215.
+            if self.motors[motor].model == "sts3215":
+                phase = self.read("Phase", motor, normalize=False)
+                if phase & 0x10:
+                    self.write("Phase", motor, phase & ~0x10)
+
    @property
    def is_calibrated(self) -> bool:
        motors_calibration = self.read_calibration()
@@ -356,8 +356,8 @@ class SerialMotorsBus(MotorsBusBase):
        motors: dict[str, Motor],
        calibration: dict[str, MotorCalibration] | None = None,
    ):
-        require_package("pyserial", extra="hardware", import_name="serial")
-        require_package("deepdiff", extra="hardware")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
+        require_package("deepdiff", extra="deepdiff-dep")
        super().__init__(port, motors, calibration)

        self.port_handler: PortHandler
@@ -23,12 +23,12 @@ from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, TypedDict

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _can_available
+from lerobot.utils.import_utils import _can_available, require_package

 if TYPE_CHECKING or _can_available:
    import can
 else:
-    can = SimpleNamespace(Message=object, interface=None)
+    can = SimpleNamespace(Message=object, interface=None, BusABC=object)
 import numpy as np

 from lerobot.utils.errors import DeviceNotConnectedError
@@ -106,6 +106,7 @@ class RobstrideMotorsBus(MotorsBusBase):
            bitrate: Nominal bitrate in bps (default: 1000000 = 1 Mbps)
            data_bitrate: Data bitrate for CAN FD in bps (default: 5000000 = 5 Mbps), ignored if use_can_fd is False
        """
+        require_package("python-can", extra="robstride", import_name="can")
        super().__init__(port, motors, calibration)
        self.port = port
        self.can_interface = can_interface
@@ -18,14 +18,21 @@ import logging
 import math
 from dataclasses import asdict, dataclass
 from pathlib import Path
+from typing import TYPE_CHECKING

 import draccus
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, LRScheduler

 from lerobot.utils.constants import SCHEDULER_STATE
+from lerobot.utils.import_utils import _diffusers_available, require_package
 from lerobot.utils.io_utils import deserialize_json_into_object, write_json

+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.optimization import get_scheduler
+else:
+    get_scheduler = None
+

@dataclass
 class LRSchedulerConfig(draccus.ChoiceRegistry, abc.ABC):
@@ -47,10 +54,7 @@ class DiffuserSchedulerConfig(LRSchedulerConfig):
    num_warmup_steps: int | None = None

    def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
-        from lerobot.utils.import_utils import require_package
-
        require_package("diffusers", extra="diffusion")
-        from diffusers.optimization import get_scheduler

        kwargs = {**asdict(self), "num_training_steps": num_training_steps, "optimizer": optimizer}
        return get_scheduler(**kwargs)
@@ -23,6 +23,7 @@ TODO(alexander-soare):
 import math
 from collections import deque
 from collections.abc import Callable
+from typing import TYPE_CHECKING

 import einops
 import numpy as np
@@ -32,6 +33,14 @@ import torchvision
 from torch import Tensor, nn

 from lerobot.utils.constants import ACTION, OBS_ENV_STATE, OBS_IMAGES, OBS_STATE
+from lerobot.utils.import_utils import _diffusers_available, require_package
+
+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
+    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+else:
+    DDIMScheduler = None
+    DDPMScheduler = None

 from ..pretrained import PreTrainedPolicy
 from ..utils import (
@@ -64,6 +73,7 @@ class DiffusionPolicy(PreTrainedPolicy):
            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
                that they will be passed with a call to `load_state_dict` before the policy is used.
        """
+        require_package("diffusers", extra="diffusion")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -155,11 +165,7 @@ def _make_noise_scheduler(name: str, **kwargs: dict):
    Factory for noise scheduler instances of the requested type. All kwargs are passed
    to the scheduler.
    """
-    from lerobot.utils.import_utils import require_package
-
    require_package("diffusers", extra="diffusion")
-    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
-    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler

    if name == "DDPM":
        return DDPMScheduler(**kwargs)
@@ -204,7 +204,9 @@ class FlowmatchingActionHead(nn.Module):
            self.position_embedding = nn.Embedding(config.max_seq_len, self.input_embedding_dim)
            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)

-        self.beta_dist = Beta(config.noise_beta_alpha, config.noise_beta_beta)
+        self._noise_beta_alpha = config.noise_beta_alpha
+        self._noise_beta_beta = config.noise_beta_beta
+        self._beta_dist = None
        self.num_timestep_buckets = config.num_timestep_buckets
        self.config = config
        self.set_trainable_parameters(config.tune_projector, config.tune_diffusion_model)
@@ -249,7 +251,9 @@ class FlowmatchingActionHead(nn.Module):
                self.model.eval()

    def sample_time(self, batch_size, device, dtype):
-        sample = self.beta_dist.sample([batch_size]).to(device, dtype=dtype)
+        if self._beta_dist is None:
+            self._beta_dist = Beta(self._noise_beta_alpha, self._noise_beta_beta, validate_args=False)
+        sample = self._beta_dist.sample([batch_size]).to(device, dtype=dtype)
        return (self.config.noise_s - sample) / self.config.noise_s

    def prepare_input(self, batch: dict) -> BatchFeature:
@@ -222,6 +222,13 @@ class Eagle25VLProcessor(ProcessorMixin):
                        videos=None,
                        **output_kwargs["images_kwargs"],
                    )
+                    if isinstance(image_inputs["pixel_values"], list):
+                        _pv = image_inputs["pixel_values"]
+                        if _pv and isinstance(_pv[0], list):
+                            _pv = [t for sub in _pv for t in sub]
+                        image_inputs["pixel_values"] = torch.stack(
+                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
+                        )
                    num_all_tiles = image_inputs["pixel_values"].shape[0]
                    special_placeholder = f"<image {idx_in_list + 1}>{self.image_start_token}{self.image_token * num_all_tiles * self.tokens_per_tile}{self.image_end_token}"
                    unified_frame_list.append(image_inputs)
@@ -233,6 +240,13 @@ class Eagle25VLProcessor(ProcessorMixin):
                        videos=[video_list[idx_in_list]],
                        **output_kwargs["videos_kwargs"],
                    )
+                    if isinstance(video_inputs["pixel_values"], list):
+                        _pv = video_inputs["pixel_values"]
+                        if _pv and isinstance(_pv[0], list):
+                            _pv = [t for sub in _pv for t in sub]
+                        video_inputs["pixel_values"] = torch.stack(
+                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
+                        )
                    num_all_tiles = video_inputs["pixel_values"].shape[0]
                    image_sizes = video_inputs["image_sizes"]
                    if timestamps_list is not None and -1 not in timestamps_list:
@@ -288,8 +302,18 @@ class Eagle25VLProcessor(ProcessorMixin):

        text = replace_in_text(text)
        if len(unified_frame_list) > 0:
-            pixel_values = torch.cat([frame["pixel_values"] for frame in unified_frame_list])
-            image_sizes = torch.cat([frame["image_sizes"] for frame in unified_frame_list])
+
+            def _to_tensor(v):
+                if isinstance(v, torch.Tensor):
+                    return v
+                if isinstance(v, list):
+                    if v and isinstance(v[0], list):
+                        v = [t for sub in v for t in sub]
+                    return torch.stack([t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in v])
+                return torch.as_tensor(v)
+
+            pixel_values = torch.cat([_to_tensor(frame["pixel_values"]) for frame in unified_frame_list])
+            image_sizes = torch.cat([_to_tensor(frame["image_sizes"]) for frame in unified_frame_list])
        else:
            pixel_values = None
            image_sizes = None
@@ -221,6 +221,7 @@ class GR00TN15(PreTrainedModel):
        self.action_horizon = config.action_horizon
        self.action_dim = config.action_dim
        self.compute_dtype = config.compute_dtype
+        self.post_init()

    def validate_inputs(self, inputs):
        # NOTE -- this should be handled internally by the model
@@ -43,6 +43,7 @@ from torch import Tensor

 from lerobot.configs import FeatureType, PolicyFeature
 from lerobot.utils.constants import ACTION, OBS_IMAGES
+from lerobot.utils.import_utils import require_package

 from ..pretrained import PreTrainedPolicy
 from .configuration_groot import GrootConfig
@@ -59,6 +60,7 @@ class GrootPolicy(PreTrainedPolicy):

    def __init__(self, config: GrootConfig, **kwargs):
        """Initialize Groot policy wrapper."""
+        require_package("transformers", extra="groot")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -36,7 +36,7 @@ import torch.nn.functional as F  # noqa: N812
 import torchvision
 from torch import Tensor

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _diffusers_available, _transformers_available, require_package

 from .configuration_multi_task_dit import MultiTaskDiTConfig

@@ -46,6 +46,13 @@ if TYPE_CHECKING or _transformers_available:
 else:
    CLIPTextModel = None
    CLIPVisionModel = None
+
+# Optional dependency: the schedulers are imported for static type checking or when
+# `_diffusers_available` is set; otherwise the names are bound to None so this module
+# can still be imported. MultiTaskDiTPolicy.__init__ calls require_package("diffusers", ...).
+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
+    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+else:
+    DDIMScheduler = None
+    DDPMScheduler = None
 from lerobot.utils.constants import (
    ACTION,
    OBS_IMAGES,
@@ -65,6 +72,8 @@ class MultiTaskDiTPolicy(PreTrainedPolicy):
    name = "multi_task_dit"

    def __init__(self, config: MultiTaskDiTConfig, **kwargs):
+        require_package("transformers", extra="multi_task_dit")
+        require_package("diffusers", extra="multi_task_dit")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -643,12 +652,6 @@ class DiffusionObjective(nn.Module):
            "prediction_type": config.prediction_type,
        }

-        from lerobot.utils.import_utils import require_package
-
-        require_package("diffusers", extra="multi_task_dit")
-        from diffusers.schedulers.scheduling_ddim import DDIMScheduler
-        from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-
        if config.noise_scheduler_type == "DDPM":
            self.noise_scheduler: DDPMScheduler | DDIMScheduler = DDPMScheduler(**scheduler_kwargs)
        elif config.noise_scheduler_type == "DDIM":
@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
@@ -947,6 +947,7 @@ class PI0Policy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
@@ -918,6 +918,7 @@ class PI05Policy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _scipy_available, _transformers_available
+from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _scipy_available:
@@ -35,7 +35,7 @@ else:
    idct = None

 if TYPE_CHECKING or _transformers_available:
-    from transformers import AutoTokenizer
+    from transformers import AutoProcessor, AutoTokenizer
    from transformers.models.auto import CONFIG_MAPPING

    from ..pi_gemma import (
@@ -44,6 +44,7 @@ if TYPE_CHECKING or _transformers_available:
    )
 else:
    CONFIG_MAPPING = None
+    AutoProcessor = None
    AutoTokenizer = None
    PiGemmaModel = None
    PaliGemmaForConditionalGenerationWithPiGemma = None
@@ -826,14 +827,14 @@ class PI0FastPolicy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
+        require_package("scipy", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config

        # Load tokenizers first
        try:
-            from transformers import AutoProcessor, AutoTokenizer
-
            # Load FAST tokenizer
            self.action_tokenizer = AutoProcessor.from_pretrained(
                config.action_tokenizer_name, trust_remote_code=True
@@ -62,6 +62,7 @@ from torch import Tensor, nn

 from lerobot.utils.constants import ACTION, OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS, OBS_STATE
 from lerobot.utils.device_utils import get_safe_dtype
+from lerobot.utils.import_utils import require_package

 from ..pretrained import PreTrainedPolicy
 from ..rtc.modeling_rtc import RTCProcessor
@@ -239,6 +240,7 @@ class SmolVLAPolicy(PreTrainedPolicy):
                    the configuration class is used.
        """

+        require_package("transformers", extra="smolvla")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -27,7 +27,7 @@ import torch.distributed as distributed
 import torch.nn.functional as F  # noqa: N812
 from einops import pack, rearrange, reduce, repeat, unpack
 from torch import einsum, nn
-from torch.cuda.amp import autocast
+from torch.amp import autocast
 from torch.optim import Optimizer

 from .configuration_vqbet import VQBeTConfig
@@ -1370,7 +1370,7 @@ class EuclideanCodebook(nn.Module):
        batch_samples = rearrange(batch_samples, "h ... d -> h (...) d")
        self.replace(batch_samples, batch_mask=expired_codes)

-    @autocast(enabled=False)
+    @autocast("cuda", enabled=False)
    def forward(self, x, sample_codebook_temp=None, mask=None, freeze_codebook=False):
        needs_codebook_dim = x.ndim < 4
        sample_codebook_temp = (
@@ -175,33 +175,36 @@ def actor_cli(cfg: TrainRLServerPipelineConfig):
    interactions_process.start()
    receive_policy_process.start()

-    act_with_policy(
-        cfg=cfg,
-        shutdown_event=shutdown_event,
-        parameters_queue=parameters_queue,
-        transitions_queue=transitions_queue,
-        interactions_queue=interactions_queue,
-    )
-    logging.info("[ACTOR] Policy process joined")
+    try:
+        act_with_policy(
+            cfg=cfg,
+            shutdown_event=shutdown_event,
+            parameters_queue=parameters_queue,
+            transitions_queue=transitions_queue,
+            interactions_queue=interactions_queue,
+        )
+        logging.info("[ACTOR] Policy loop finished")
+    except Exception:
+        logging.exception("[ACTOR] Unhandled exception in act_with_policy")
+        shutdown_event.set()
+    finally:
+        logging.info("[ACTOR] Closing queues")
+        transitions_queue.close()
+        interactions_queue.close()
+        parameters_queue.close()

-    logging.info("[ACTOR] Closing queues")
-    transitions_queue.close()
-    interactions_queue.close()
-    parameters_queue.close()
+        transitions_process.join()
+        logging.info("[ACTOR] Transitions process joined")
+        interactions_process.join()
+        logging.info("[ACTOR] Interactions process joined")
+        receive_policy_process.join()
+        logging.info("[ACTOR] Receive policy process joined")

-    transitions_process.join()
-    logging.info("[ACTOR] Transitions process joined")
-    interactions_process.join()
-    logging.info("[ACTOR] Interactions process joined")
-    receive_policy_process.join()
-    logging.info("[ACTOR] Receive policy process joined")
+        transitions_queue.cancel_join_thread()
+        interactions_queue.cancel_join_thread()
+        parameters_queue.cancel_join_thread()

-    logging.info("[ACTOR] join queues")
-    transitions_queue.cancel_join_thread()
-    interactions_queue.cancel_join_thread()
-    parameters_queue.cancel_join_thread()
-
-    logging.info("[ACTOR] queues closed")
+        logging.info("[ACTOR] Cleanup complete")


 # Core algorithm functions
@@ -15,6 +15,7 @@
 # limitations under the License.

 import functools
+import threading
 from collections.abc import Callable, Sequence
 from contextlib import suppress
 from typing import TypedDict
@@ -115,6 +116,7 @@ class ReplayBuffer:
        self.size = 0
        self.initialized = False
        self.optimize_memory = optimize_memory
+        self._lock = threading.Lock()

        # Track episode boundaries for memory optimization
        self.episode_ends = torch.zeros(capacity, dtype=torch.bool, device=storage_device)
@@ -198,68 +200,75 @@ class ReplayBuffer:
        complementary_info: dict[str, torch.Tensor] | None = None,
    ):
        """Saves a transition, ensuring tensors are stored on the designated storage device."""
-        # Initialize storage if this is the first transition
-        if not self.initialized:
-            self._initialize_storage(state=state, action=action, complementary_info=complementary_info)
+        with self._lock:
+            # Initialize storage if this is the first transition
+            if not self.initialized:
+                self._initialize_storage(state=state, action=action, complementary_info=complementary_info)

-        # Store the transition in pre-allocated tensors
-        for key in self.states:
-            self.states[key][self.position].copy_(state[key].squeeze(dim=0))
+            # Store the transition in pre-allocated tensors
+            for key in self.states:
+                self.states[key][self.position].copy_(state[key].squeeze(dim=0))

-            if not self.optimize_memory:
-                # Only store next_states if not optimizing memory
-                self.next_states[key][self.position].copy_(next_state[key].squeeze(dim=0))
+                if not self.optimize_memory:
+                    # Only store next_states if not optimizing memory
+                    self.next_states[key][self.position].copy_(next_state[key].squeeze(dim=0))

-        self.actions[self.position].copy_(action.squeeze(dim=0))
-        self.rewards[self.position] = reward
-        self.dones[self.position] = done
-        self.truncateds[self.position] = truncated
+            self.actions[self.position].copy_(action.squeeze(dim=0))
+            self.rewards[self.position] = reward
+            self.dones[self.position] = done
+            self.truncateds[self.position] = truncated

-        # Handle complementary_info if provided and storage is initialized
-        if complementary_info is not None and self.has_complementary_info:
-            # Store the complementary_info
-            for key in self.complementary_info_keys:
-                if key in complementary_info:
-                    value = complementary_info[key]
-                    if isinstance(value, torch.Tensor):
-                        self.complementary_info[key][self.position].copy_(value.squeeze(dim=0))
-                    elif isinstance(value, (int | float)):
-                        self.complementary_info[key][self.position] = value
+            # Handle complementary_info if provided and storage is initialized
+            if complementary_info is not None and self.has_complementary_info:
+                for key in self.complementary_info_keys:
+                    if key in complementary_info:
+                        value = complementary_info[key]
+                        if isinstance(value, torch.Tensor):
+                            self.complementary_info[key][self.position].copy_(value.squeeze(dim=0))
+                        elif isinstance(value, (int | float)):
+                            self.complementary_info[key][self.position] = value

-        self.position = (self.position + 1) % self.capacity
-        self.size = min(self.size + 1, self.capacity)
+            self.position = (self.position + 1) % self.capacity
+            self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size: int) -> BatchTransition:
        """Sample a random batch of transitions and collate them into batched tensors."""
        if not self.initialized:
            raise RuntimeError("Cannot sample from an empty buffer. Add transitions first.")

-        batch_size = min(batch_size, self.size)
-        high = max(0, self.size - 1) if self.optimize_memory and self.size < self.capacity else self.size
+        with self._lock:
+            batch_size = min(batch_size, self.size)
+            high = max(0, self.size - 1) if self.optimize_memory and self.size < self.capacity else self.size

-        # Random indices for sampling - create on the same device as storage
-        idx = torch.randint(low=0, high=high, size=(batch_size,), device=self.storage_device)
+            idx = torch.randint(low=0, high=high, size=(batch_size,), device=self.storage_device)

-        # Identify image keys that need augmentation
-        image_keys = [k for k in self.states if k.startswith(OBS_IMAGE)] if self.use_drq else []
+            image_keys = [k for k in self.states if k.startswith(OBS_IMAGE)] if self.use_drq else []

-        # Create batched state and next_state
-        batch_state = {}
-        batch_next_state = {}
+            batch_state = {}
+            batch_next_state = {}

-        # First pass: load all state tensors to target device
-        for key in self.states:
-            batch_state[key] = self.states[key][idx].to(self.device)
+            for key in self.states:
+                batch_state[key] = self.states[key][idx].to(self.device)

-            if not self.optimize_memory:
-                # Standard approach - load next_states directly
-                batch_next_state[key] = self.next_states[key][idx].to(self.device)
-            else:
-                # Memory-optimized approach - get next_state from the next index
-                next_idx = (idx + 1) % self.capacity
-                batch_next_state[key] = self.states[key][next_idx].to(self.device)
+                if not self.optimize_memory:
+                    batch_next_state[key] = self.next_states[key][idx].to(self.device)
+                else:
+                    next_idx = (idx + 1) % self.capacity
+                    batch_next_state[key] = self.states[key][next_idx].to(self.device)
+
+            # Sample other tensors
+            batch_actions = self.actions[idx].to(self.device)
+            batch_rewards = self.rewards[idx].to(self.device)
+            batch_dones = self.dones[idx].to(self.device).float()
+            batch_truncateds = self.truncateds[idx].to(self.device).float()
+
+            # Sample complementary_info if available
+            batch_complementary_info = None
+            if self.has_complementary_info:
+                batch_complementary_info = {}
+                for key in self.complementary_info_keys:
+                    batch_complementary_info[key] = self.complementary_info[key][idx].to(self.device)

-        # Apply image augmentation in a batched way if needed
        if self.use_drq and image_keys:
            # Concatenate all images from state and next_state
            all_images = []
@@ -280,19 +289,6 @@ class ReplayBuffer:
                # Next states start after the states at index (i*2+1)*batch_size and also take up batch_size slots
                batch_next_state[key] = augmented_images[(i * 2 + 1) * batch_size : (i + 1) * 2 * batch_size]

-        # Sample other tensors
-        batch_actions = self.actions[idx].to(self.device)
-        batch_rewards = self.rewards[idx].to(self.device)
-        batch_dones = self.dones[idx].to(self.device).float()
-        batch_truncateds = self.truncateds[idx].to(self.device).float()
-
-        # Sample complementary_info if available
-        batch_complementary_info = None
-        if self.has_complementary_info:
-            batch_complementary_info = {}
-            for key in self.complementary_info_keys:
-                batch_complementary_info[key] = self.complementary_info[key][idx].to(self.device)
-
        return BatchTransition(
            state=batch_state,
            action=batch_actions,
@@ -551,8 +551,8 @@ def step_env_and_process_transition(
    terminated = terminated or processed_action_transition[TransitionKey.DONE]
    truncated = truncated or processed_action_transition[TransitionKey.TRUNCATED]
    complementary_data = processed_action_transition[TransitionKey.COMPLEMENTARY_DATA].copy()
-    new_info = processed_action_transition[TransitionKey.INFO].copy()
-    new_info.update(info)
+    new_info = info.copy()
+    new_info.update(processed_action_transition[TransitionKey.INFO])

    new_transition = create_transition(
        observation=obs,
@@ -218,30 +218,33 @@ def start_learner_threads(
    )
    communication_process.start()

-    add_actor_information_and_train(
-        cfg=cfg,
-        wandb_logger=wandb_logger,
-        shutdown_event=shutdown_event,
-        transition_queue=transition_queue,
-        interaction_message_queue=interaction_message_queue,
-        parameters_queue=parameters_queue,
-    )
-    logging.info("[LEARNER] Training process stopped")
+    try:
+        add_actor_information_and_train(
+            cfg=cfg,
+            wandb_logger=wandb_logger,
+            shutdown_event=shutdown_event,
+            transition_queue=transition_queue,
+            interaction_message_queue=interaction_message_queue,
+            parameters_queue=parameters_queue,
+        )
+        logging.info("[LEARNER] Training process stopped")
+    except Exception:
+        logging.exception("[LEARNER] Unhandled exception in training loop")
+        shutdown_event.set()
+    finally:
+        logging.info("[LEARNER] Closing queues")
+        transition_queue.close()
+        interaction_message_queue.close()
+        parameters_queue.close()

-    logging.info("[LEARNER] Closing queues")
-    transition_queue.close()
-    interaction_message_queue.close()
-    parameters_queue.close()
+        communication_process.join()
+        logging.info("[LEARNER] Communication process joined")

-    communication_process.join()
-    logging.info("[LEARNER] Communication process joined")
+        transition_queue.cancel_join_thread()
+        interaction_message_queue.cancel_join_thread()
+        parameters_queue.cancel_join_thread()

-    logging.info("[LEARNER] join queues")
-    transition_queue.cancel_join_thread()
-    interaction_message_queue.cancel_join_thread()
-    parameters_queue.cancel_join_thread()
-
-    logging.info("[LEARNER] queues closed")
+        logging.info("[LEARNER] Cleanup complete")


 # Core algorithm functions
@@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, Any

 from lerobot.cameras import make_cameras_from_configs
 from lerobot.types import RobotAction, RobotObservation
-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 from ..robot import Robot
 from ..utils import ensure_safe_goal_position
@@ -81,6 +81,7 @@ class Reachy2Robot(Robot):
    name = "reachy2"

    def __init__(self, config: Reachy2RobotConfig):
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -27,7 +27,7 @@ import numpy as np

 from lerobot.cameras import make_cameras_from_configs
 from lerobot.types import RobotAction, RobotObservation
-from lerobot.utils.import_utils import _unitree_sdk_available
+from lerobot.utils.import_utils import _unitree_sdk_available, require_package

 from ..robot import Robot
 from .config_unitree_g1 import UnitreeG1Config
@@ -111,6 +111,7 @@ class UnitreeG1(Robot):
    name = "unitree_g1"

    def __init__(self, config: UnitreeG1Config):
+        require_package("unitree-sdk2py", extra="unitree_g1", import_name="unitree_sdk2py")
        super().__init__(config)

        logger.info("Initialize UnitreeG1...")
@@ -286,7 +286,7 @@ def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int):
    if len(set(num_eps_per_cam)) != 1:
        raise ValueError(f"All cams dont have same number of episodes ({num_eps_per_cam}).")

-    episods_metadata = []
+    episodes_metadata = []
    num_cameras = len(video_keys)
    num_episodes = num_eps_per_cam[0]
    for ep_idx in tqdm.tqdm(range(num_episodes), desc="convert videos"):
@@ -299,9 +299,9 @@ def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int):
        ep_dict = {}
        for cam_idx in range(num_cameras):
            ep_dict.update(eps_metadata_per_cam[cam_idx][ep_idx])
-        episods_metadata.append(ep_dict)
+        episodes_metadata.append(ep_dict)

-    return episods_metadata
+    return episodes_metadata


 def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_file_size_in_mb: int):
@@ -559,7 +559,11 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
            )

        # Load pretrained policy
-        policy = None if cfg.policy is None else make_policy(cfg.policy, ds_meta=dataset.meta)
+        policy = (
+            None
+            if cfg.policy is None
+            else make_policy(cfg.policy, ds_meta=dataset.meta, rename_map=cfg.dataset.rename_map)
+        )
        preprocessor = None
        postprocessor = None
        interpolator = None
@@ -386,7 +386,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        sampler=sampler,
        pin_memory=device.type == "cuda",
        drop_last=False,
-        prefetch_factor=2 if cfg.num_workers > 0 else None,
+        prefetch_factor=cfg.prefetch_factor if cfg.num_workers > 0 else None,
+        persistent_workers=cfg.persistent_workers and cfg.num_workers > 0,
    )

    # Prepare everything with accelerator
@@ -433,6 +434,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
    for _ in range(step, cfg.steps):
        start_time = time.perf_counter()
        batch = next(dl_iter)
+        for cam_key in dataset.meta.camera_keys:
+            if cam_key in batch and batch[cam_key].dtype == torch.uint8:
+                batch[cam_key] = batch[cam_key].to(dtype=torch.float32) / 255.0
        batch = preprocessor(batch)
        train_tracker.dataloading_s = time.perf_counter() - start_time

@@ -15,9 +15,22 @@
 # limitations under the License.

 import logging
+from typing import TYPE_CHECKING
+
+from lerobot.utils.import_utils import _hidapi_available, _pygame_available, require_package

 from ..utils import TeleopEvents

+# Optional backends: each module is imported for static type checking or when its
+# availability flag is set; otherwise the name is bound to None. GamepadController and
+# GamepadControllerHID call require_package(...) in __init__ before these names are used.
+if TYPE_CHECKING or _pygame_available:
+    import pygame
+else:
+    pygame = None  # type: ignore[assignment]
+
+if TYPE_CHECKING or _hidapi_available:
+    import hid
+else:
+    hid = None  # type: ignore[assignment]
+

 class InputController:
    """Base class for input controllers that generate motion deltas."""
@@ -199,6 +212,7 @@ class GamepadController(InputController):
    """Generate motion deltas from gamepad input."""

    def __init__(self, x_step_size=1.0, y_step_size=1.0, z_step_size=1.0, deadzone=0.1):
+        require_package("pygame", extra="gamepad")
        super().__init__(x_step_size, y_step_size, z_step_size)
        self.deadzone = deadzone
        self.joystick = None
@@ -206,8 +220,6 @@ class GamepadController(InputController):

    def start(self):
        """Initialize pygame and the gamepad."""
-        import pygame
-
        pygame.init()
        pygame.joystick.init()

@@ -230,8 +242,6 @@ class GamepadController(InputController):

    def stop(self):
        """Clean up pygame resources."""
-        import pygame
-
        if pygame.joystick.get_init():
            if self.joystick:
                self.joystick.quit()
@@ -240,8 +250,6 @@ class GamepadController(InputController):

    def update(self):
        """Process pygame events to get fresh gamepad readings."""
-        import pygame
-
        for event in pygame.event.get():
            if event.type == pygame.JOYBUTTONDOWN:
                if event.button == 3:
@@ -280,8 +288,6 @@ class GamepadController(InputController):

    def get_deltas(self):
        """Get the current movement deltas from gamepad state."""
-        import pygame
-
        try:
            # Read joystick axes
            # Left stick X and Y (typically axes 0 and 1)
@@ -326,6 +332,7 @@ class GamepadControllerHID(InputController):
            z_scale: Scaling factor for Z-axis movement
            deadzone: Joystick deadzone to prevent drift
        """
+        require_package("hidapi", extra="gamepad", import_name="hid")
        super().__init__(x_step_size, y_step_size, z_step_size)
        self.deadzone = deadzone
        self.device = None
@@ -342,8 +349,6 @@ class GamepadControllerHID(InputController):

    def find_device(self):
        """Look for the gamepad device by vendor and product ID."""
-        import hid
-
        devices = hid.enumerate()
        for device in devices:
            device_name = device["product_string"]
@@ -357,8 +362,6 @@ class GamepadControllerHID(InputController):

    def start(self):
        """Connect to the gamepad using HIDAPI."""
-        import hid
-
        self.device_info = self.find_device()
        if not self.device_info:
            self.running = False
@@ -45,7 +45,7 @@ class HomunculusArm(Teleoperator):
    name = "homunculus_arm"

    def __init__(self, config: HomunculusArmConfig):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
        super().__init__(config)
        self.config = config
        self.serial = serial.Serial(config.port, config.baud_rate, timeout=1)
@@ -71,7 +71,7 @@ class HomunculusGlove(Teleoperator):
    name = "homunculus_glove"

    def __init__(self, config: HomunculusGloveConfig):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
        super().__init__(config)
        self.config = config
        self.serial = serial.Serial(config.port, config.baud_rate, timeout=1)
@@ -23,7 +23,7 @@ from typing import Any

 from lerobot.types import RobotAction
 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _pynput_available
+from lerobot.utils.import_utils import _pynput_available, require_package

 from ..teleoperator import Teleoperator
 from ..utils import TeleopEvents
@@ -56,6 +56,7 @@ class KeyboardTeleop(Teleoperator):
    name = "keyboard"

    def __init__(self, config: KeyboardTeleopConfig):
+        require_package("pynput", extra="pynput-dep")
        super().__init__(config)
        self.config = config
        self.robot_type = config.type
@@ -21,14 +21,24 @@
 import logging
 import threading
 import time
+from typing import TYPE_CHECKING

-import hebi
 import numpy as np
-from teleop import Teleop

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+from lerobot.utils.import_utils import _hebi_available, _teleop_available, require_package
 from lerobot.utils.rotation import Rotation

+# Optional dependencies: imported for static type checking or when the matching
+# availability flag is set; otherwise the name is bound to None. IOSPhone and
+# AndroidPhone call require_package(...) for both packages in __init__.
+if TYPE_CHECKING or _hebi_available:
+    import hebi
+else:
+    hebi = None
+
+if TYPE_CHECKING or _teleop_available:
+    from teleop import Teleop
+else:
+    Teleop = None
+
 from ..teleoperator import Teleoperator
 from .config_phone import PhoneConfig, PhoneOS

@@ -74,6 +84,8 @@ class IOSPhone(BasePhone, Teleoperator):
    name = "ios_phone"

    def __init__(self, config: PhoneConfig):
+        require_package("hebi-py", extra="phone", import_name="hebi")
+        require_package("teleop", extra="phone")
        super().__init__(config)
        self.config = config
        self._group = None
@@ -213,6 +225,8 @@ class AndroidPhone(BasePhone, Teleoperator):
    name = "android_phone"

    def __init__(self, config: PhoneConfig):
+        require_package("hebi-py", extra="phone", import_name="hebi")
+        require_package("teleop", extra="phone")
        super().__init__(config)
        self.config = config
        self._teleop = None
@@ -19,7 +19,7 @@ import logging
 import time
 from typing import TYPE_CHECKING

-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 if TYPE_CHECKING or _reachy2_sdk_available:
    from reachy2_sdk import ReachySDK
@@ -84,6 +84,7 @@ class Reachy2Teleoperator(Teleoperator):
    name = "reachy2_specific"

    def __init__(self, config: Reachy2TeleoperatorConfig):
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -34,7 +34,7 @@ from typing import TYPE_CHECKING

 import numpy as np

-from lerobot.utils.import_utils import _serial_available
+from lerobot.utils.import_utils import _serial_available, require_package

 if TYPE_CHECKING or _serial_available:
    import serial
@@ -156,6 +156,7 @@ def run_exo_calibration(
    """
    Run interactive calibration for an exoskeleton arm.
    """
+    require_package("pyserial", extra="unitree_g1", import_name="serial")
    try:
        import cv2
        import matplotlib.pyplot as plt
@@ -76,7 +76,7 @@ class ExoskeletonArm:
    calibration: ExoskeletonCalibration | None = None

    def __post_init__(self):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="unitree_g1", import_name="serial")
        if self.calibration_fpath.is_file():
            self._load_calibration()

@@ -115,6 +115,12 @@ _feetech_sdk_available = is_package_available("feetech-servo-sdk", import_name="
 _reachy2_sdk_available = is_package_available("reachy2_sdk")
 _can_available = is_package_available("python-can", "can")
 _unitree_sdk_available = is_package_available("unitree-sdk2py", "unitree_sdk2py")
+_pyrealsense2_available = is_package_available("pyrealsense2")
+_zmq_available = is_package_available("pyzmq", import_name="zmq")
+_hebi_available = is_package_available("hebi-py", import_name="hebi")
+_teleop_available = is_package_available("teleop")
+_placo_available = is_package_available("placo")
+_hidapi_available = is_package_available("hidapi", import_name="hid")

 # Data / serialization
 _pandas_available = is_package_available("pandas")
@@ -52,6 +52,9 @@ def get_policy_stats(ds_repo_id: str, policy_name: str, policy_kwargs: dict):
    )

    batch = next(iter(dataloader))
+    for key in batch:
+        if isinstance(batch[key], torch.Tensor) and batch[key].dtype == torch.uint8:
+            batch[key] = batch[key].to(dtype=torch.float32) / 255.0
    batch = preprocessor(batch)
    loss, output_dict = policy.forward(batch)

@@ -82,6 +85,9 @@ def get_policy_stats(ds_repo_id: str, policy_name: str, policy_kwargs: dict):
    # indicating padding (those ending with "_is_pad")
    dataset.reader.delta_indices = None
    batch = next(iter(dataloader))
+    for key in batch:
+        if isinstance(batch[key], torch.Tensor) and batch[key].dtype == torch.uint8:
+            batch[key] = batch[key].to(dtype=torch.float32) / 255.0
    obs = {}
    for k in batch:
        # TODO: regenerate the safetensors
@@ -454,6 +454,35 @@ def test_tmp_video_deletion(tmp_path, empty_lerobot_dataset_factory):
    )


+def test_cleanup_interrupted_episode_removes_image_temp_dirs(tmp_path, empty_lerobot_dataset_factory):
+    """Check that cleaning up an interrupted episode deletes the temporary frame directories.
+
+    One frame is added for an image feature and a video feature and the episode is never
+    saved; after ``cleanup_interrupted_episode`` neither temporary image directory may remain.
+    """
+    feature_keys = ("image", "video")
+    dataset = empty_lerobot_dataset_factory(
+        root=tmp_path / "interrupted",
+        features={
+            "image": {"dtype": "image", "shape": DUMMY_CHW, "names": ["channels", "height", "width"]},
+            "video": {"dtype": "video", "shape": DUMMY_HWC, "names": ["height", "width", "channels"]},
+        },
+        streaming_encoding=False,
+    )
+
+    # A frame that is added but never saved stands in for an interrupted recording.
+    frame = {
+        "image": np.random.rand(*DUMMY_CHW),
+        "video": np.random.rand(*DUMMY_HWC),
+        "task": "Dummy task",
+    }
+    dataset.add_frame(frame)
+
+    temp_dirs = {key: dataset.writer._get_image_file_dir(0, key) for key in feature_keys}
+    # Precondition: add_frame created a temp dir for each feature.
+    for temp_dir in temp_dirs.values():
+        assert temp_dir.exists()
+
+    dataset.writer.cleanup_interrupted_episode(episode_index=0)
+
+    for key, temp_dir in temp_dirs.items():
+        assert not temp_dir.exists(), f"{key} temp dir leaked after cleanup_interrupted_episode"
+
+
 def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory):
    """Verify temporary image directories are removed appropriately when both image and video features are present."""
    image_key = "image"
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper.
+
+These tests mock out the SAPIEN-based RoboTwin runtime (task modules +
+YAML config loader) so they run without the full RoboTwin installation
+(SAPIEN, CuRobo, mplib, asset downloads, etc.).
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from unittest.mock import MagicMock, patch
+
+import gymnasium as gym
+import numpy as np
+import pytest
+
+from lerobot.envs.robotwin import (
+    ACTION_DIM,
+    ROBOTWIN_CAMERA_NAMES,
+    ROBOTWIN_TASKS,
+    RoboTwinEnv,
+    create_robotwin_envs,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures / helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_mock_task_env(
+    height: int = 240,
+    width: int = 320,
+    cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES,
+) -> MagicMock:
+    """Build a MagicMock standing in for a RoboTwin task instance.
+
+    ``get_obs`` returns the same nested layout as RoboTwin's real get_obs:
+        {"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...}
+    with an all-zero ``uint8`` frame of shape ``(height, width, 3)`` per camera and an
+    all-zero ``float32`` joint vector of length ``ACTION_DIM``. The mock reports failure
+    (``eval_success`` and ``check_success()`` are False); ``setup_demo``, ``take_action``
+    and ``close_env`` return None.
+    """
+    # Every camera gets its own freshly allocated black frame.
+    camera_frames = {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras}
+
+    task_env = MagicMock()
+    task_env.eval_success = False
+    task_env.configure_mock(
+        **{
+            "get_obs.return_value": {
+                "observation": camera_frames,
+                "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)},
+                "endpose": {},
+            },
+            "setup_demo.return_value": None,
+            "take_action.return_value": None,
+            "check_success.return_value": False,
+            "close_env.return_value": None,
+        }
+    )
+    return task_env
+
+
+@contextmanager
+def _patch_runtime(mock_task_instance: MagicMock):
+    """Stub out the RoboTwin runtime loaders for the duration of the ``with`` block.
+
+    ``_load_robotwin_task`` is patched to return a task class whose call yields
+    ``mock_task_instance``, and ``_load_robotwin_setup_kwargs`` to return a small fixed
+    setup dict, so the env can construct + reset without a real RoboTwin install.
+    """
+    setup_kwargs = {
+        "head_camera_h": 240,
+        "head_camera_w": 320,
+        "left_embodiment_config": {},
+        "right_embodiment_config": {},
+        "left_robot_file": "",
+        "right_robot_file": "",
+        "dual_arm_embodied": True,
+        "render_freq": 0,
+        "task_name": "beat_block_hammer",
+        "task_config": "demo_clean",
+    }
+    task_factory = MagicMock(return_value=mock_task_instance)
+
+    # The patchers are only activated once entered by the ``with`` statement below.
+    task_loader_patch = patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_factory)
+    setup_loader_patch = patch("lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=setup_kwargs)
+    with task_loader_patch, setup_loader_patch:
+        yield
+
+
+# ---------------------------------------------------------------------------
+# RoboTwinEnv unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestRoboTwinEnv:
+    """Tests for ``RoboTwinEnv`` that run against a mocked RoboTwin task instance."""
+
+    def test_observation_space_shape(self):
+        """observation_space should have the configured h×w×3 for every camera."""
+        height, width = 240, 320
+        requested_cameras = ("head_camera", "left_camera")
+        env = RoboTwinEnv(
+            task_name="beat_block_hammer",
+            observation_height=height,
+            observation_width=width,
+            camera_names=list(requested_cameras),
+        )
+        pixels_space = env.observation_space["pixels"]
+        for cam in requested_cameras:
+            assert pixels_space[cam].shape == (height, width, 3)
+        # Cameras that were not requested must not show up in the space.
+        assert "right_camera" not in pixels_space
+
+    def test_action_space(self):
+        """The action space has shape (ACTION_DIM,) and dtype float32."""
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        action_space = env.action_space
+        assert action_space.shape == (ACTION_DIM,)
+        assert action_space.dtype == np.float32
+
+    def test_reset_returns_correct_obs_keys(self):
+        """reset() yields pixels for every default camera, an agent_pos vector and is_success=False."""
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task_mock):
+            obs, info = env.reset()
+
+        assert "pixels" in obs
+        pixels = obs["pixels"]
+        for cam in ROBOTWIN_CAMERA_NAMES:
+            assert cam in pixels, f"Missing camera '{cam}' in obs"
+        assert "agent_pos" in obs
+        assert obs["agent_pos"].shape == (ACTION_DIM,)
+        assert info["is_success"] is False
+
+    def test_reset_calls_setup_demo(self):
+        """reset(seed=...) calls the task's setup_demo exactly once with that seed and is_test=True."""
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task_mock):
+            env.reset(seed=42)
+
+        setup_demo = task_mock.setup_demo
+        assert setup_demo.call_count == 1
+        # setup_demo receives the full YAML-derived kwargs plus seed + is_test;
+        # only the caller-provided bits are checked here.
+        passed_kwargs = setup_demo.call_args.kwargs
+        assert passed_kwargs["seed"] == 42
+        assert passed_kwargs["is_test"] is True
+
+    def test_step_returns_correct_types(self):
+        """step() returns (dict, float, bool, bool, dict)."""
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task_mock):
+            env.reset()
+            obs, reward, terminated, truncated, info = env.step(zero_action)
+
+        expected_types = [
+            (obs, dict),
+            (reward, float),
+            (terminated, bool),
+            (truncated, bool),
+            (info, dict),
+        ]
+        for value, expected_type in expected_types:
+            assert isinstance(value, expected_type)
+
+    def test_step_wrong_action_shape_raises(self):
+        """An action of the wrong dimension is rejected with a ValueError."""
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        wrong_dim_action = np.zeros(7, dtype=np.float32)  # wrong dim
+        with _patch_runtime(task_mock):
+            env.reset()
+            with pytest.raises(ValueError, match="Expected 1-D action"):
+                env.step(wrong_dim_action)
+
+    def test_success_terminates_episode(self):
+        """When the task's check_success() is True, step() terminates and reports is_success."""
+        task_mock = _make_mock_task_env()
+        task_mock.check_success.return_value = True
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task_mock):
+            env.reset()
+            _obs, _reward, terminated, _truncated, info = env.step(zero_action)
+
+        assert terminated is True
+        assert info["is_success"] is True
+
+    def test_truncation_after_episode_length(self):
+        """The step that reaches episode_length is flagged as truncated."""
+        episode_length = 2
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=episode_length)
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task_mock):
+            env.reset()
+            # Consume every step but the last one of the episode budget.
+            for _ in range(episode_length - 1):
+                env.step(zero_action)
+            _obs, _reward, _terminated, truncated, _info = env.step(zero_action)
+
+        assert truncated is True
+
+    def test_close_calls_close_env(self):
+        """close() forwards to the task's close_env exactly once."""
+        task_mock = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task_mock):
+            env.reset()
+            env.close()
+
+        assert task_mock.close_env.call_count == 1
+
+    def test_black_frame_for_missing_camera(self):
+        """If a camera key is absent from get_obs(), a black frame is returned."""
+        side = 10
+        # The mock only exposes head_camera, while the env is asked for head_camera + left_camera.
+        task_mock = _make_mock_task_env(height=side, width=side, cameras=("head_camera",))
+        env = RoboTwinEnv(
+            task_name="beat_block_hammer",
+            camera_names=["head_camera", "left_camera"],
+            observation_height=side,
+            observation_width=side,
+        )
+        with _patch_runtime(task_mock):
+            obs, _ = env.reset()
+
+        fallback_frame = obs["pixels"]["left_camera"]
+        assert fallback_frame.shape == (side, side, 3)
+        assert fallback_frame.sum() == 0
+
+    def test_task_and_task_description_attributes(self):
+        """The env exposes the task name as ``task`` and a string ``task_description``."""
+        task_name = "beat_block_hammer"
+        env = RoboTwinEnv(task_name=task_name)
+        assert env.task == task_name
+        assert isinstance(env.task_description, str)
+
+    def test_deferred_init_env_is_none_before_reset(self):
+        """Constructing the env does not build the underlying task: ``_env`` is still None."""
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        assert env._env is None  # noqa: SLF001  (testing internal state)
+
+
+# ---------------------------------------------------------------------------
+# create_robotwin_envs tests
+# ---------------------------------------------------------------------------
+
+
+class TestCreateRoboTwinEnvs:
+    """Tests for the ``create_robotwin_envs`` factory."""
+
+    def test_returns_correct_structure(self):
+        """The result maps task name -> index -> vector env of the requested class."""
+        task_name = "beat_block_hammer"
+        with _patch_runtime(_make_mock_task_env()):
+            envs = create_robotwin_envs(
+                task=task_name,
+                n_envs=1,
+                env_cls=gym.vector.SyncVectorEnv,
+            )
+
+        assert task_name in envs
+        assert 0 in envs[task_name]
+        assert isinstance(envs[task_name][0], gym.vector.SyncVectorEnv)
+
+    def test_multi_task(self):
+        """A comma-separated task string produces one entry per task."""
+        with _patch_runtime(_make_mock_task_env()):
+            envs = create_robotwin_envs(
+                task="beat_block_hammer,click_bell",
+                n_envs=1,
+                env_cls=gym.vector.SyncVectorEnv,
+            )
+
+        assert set(envs) == {"beat_block_hammer", "click_bell"}
+
+    def test_unknown_task_raises(self):
+        """A task name that is not a known RoboTwin task is rejected with a ValueError."""
+        factory_kwargs = {
+            "task": "not_a_real_task",
+            "n_envs": 1,
+            "env_cls": gym.vector.SyncVectorEnv,
+        }
+        with pytest.raises(ValueError, match="Unknown RoboTwin tasks"):
+            create_robotwin_envs(**factory_kwargs)
+
+    def test_invalid_n_envs_raises(self):
+        """n_envs=0 is rejected with a ValueError."""
+        factory_kwargs = {
+            "task": "beat_block_hammer",
+            "n_envs": 0,
+            "env_cls": gym.vector.SyncVectorEnv,
+        }
+        with pytest.raises(ValueError, match="n_envs must be a positive int"):
+            create_robotwin_envs(**factory_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# ROBOTWIN_TASKS list
+# ---------------------------------------------------------------------------
+
+
+def test_task_list_not_empty():
+    """ROBOTWIN_TASKS must list at least 50 tasks."""
+    task_count = len(ROBOTWIN_TASKS)
+    assert task_count >= 50
+
+
+def test_all_tasks_are_strings():
+    """Every entry of ROBOTWIN_TASKS is a non-empty string."""
+    for task_name in ROBOTWIN_TASKS:
+        assert isinstance(task_name, str)
+        assert task_name
+
+
+def test_no_duplicate_tasks():
+    """No task name appears more than once in ROBOTWIN_TASKS."""
+    seen = set()
+    for task_name in ROBOTWIN_TASKS:
+        assert task_name not in seen
+        seen.add(task_name)
@@ -429,6 +429,67 @@ def test_set_half_turn_homings(mock_motors, dummy_motors):
    assert all(mock_motors.stubs[stub].wait_called() for stub in write_homing_stubs)


+@pytest.mark.parametrize(
+    "initial_phase, expected_phase",
+    [
+        (0b00010000, 0b00000000),  # bit 4 set - cleared
+        (0b11111111, 0b11101111),  # all bits set - bit 4 cleared, others preserved
+        (0b00000000, 0b00000000),  # bit 4 already 0 - unchanged
+    ],
+    ids=["bit4_set", "all_bits_set", "bit4_already_cleared"],
+)
+def test_configure_motors_clears_sts3215_phase_bit4(initial_phase, expected_phase, mock_motors, dummy_motors):
+    """Phase register bit 4 (angle feedback mode) must be cleared for sts3215, other bits preserved.
+
+    Phase is read for every motor; it is written back only when clearing bit 4 changes the value.
+    """
+    phase_must_change = initial_phase != expected_phase
+    phase_address = STS_SMS_SERIES_CONTROL_TABLE["Phase"]
+    # Registers written unconditionally by configure_motors, with the values expected on the wire.
+    unconditional_writes = (
+        ("Return_Delay_Time", 0),
+        ("Maximum_Acceleration", 254),
+        ("Acceleration", 254),
+    )
+
+    read_stubs = []
+    write_stubs = []
+    for motor in dummy_motors.values():
+        for register, value in unconditional_writes:
+            mock_motors.build_write_stub(*STS_SMS_SERIES_CONTROL_TABLE[register], motor.id, value)
+        read_stubs.append(mock_motors.build_read_stub(*phase_address, motor.id, initial_phase))
+        if phase_must_change:
+            write_stubs.append(mock_motors.build_write_stub(*phase_address, motor.id, expected_phase))
+
+    bus = FeetechMotorsBus(port=mock_motors.port, motors=dummy_motors)
+    bus.connect(handshake=False)
+
+    # Wrap (not replace) bus.write so real writes still happen while calls are recorded.
+    with patch.object(bus, "write", wraps=bus.write) as write_spy:
+        bus.configure_motors()
+
+    assert all(mock_motors.stubs[stub].called for stub in read_stubs)
+    if phase_must_change:
+        # Phase is written only if it needs to be changed.
+        assert all(mock_motors.stubs[stub].wait_called() for stub in write_stubs)
+        return
+
+    # No change required: Phase must not have been written for any motor.
+    written_registers = [call.args[0] for call in write_spy.call_args_list]
+    assert "Phase" not in written_registers
+
+
+def test_configure_motors_skips_phase_for_non_sts3215(mock_motors):
+    """Phase register must not be touched for motors other than sts3215."""
+    motors = {
+        f"dummy_{motor_id}": Motor(motor_id, "sts3250", MotorNormMode.RANGE_M100_100)
+        for motor_id in (1, 2, 3)
+    }
+    # Registers written by configure_motors, with the values expected on the wire.
+    expected_writes = (
+        ("Return_Delay_Time", 0),
+        ("Maximum_Acceleration", 254),
+        ("Acceleration", 254),
+    )
+    for motor in motors.values():
+        for register, value in expected_writes:
+            mock_motors.build_write_stub(*STS_SMS_SERIES_CONTROL_TABLE[register], motor.id, value)
+
+    bus = FeetechMotorsBus(port=mock_motors.port, motors=motors)
+    bus.connect(handshake=False)
+
+    # Wrap (not replace) bus.read so any read is both performed and recorded.
+    with patch.object(bus, "read", wraps=bus.read) as read_spy:
+        bus.configure_motors()
+
+    read_registers = [call.args[0] for call in read_spy.call_args_list]
+    assert "Phase" not in read_registers
+
+
 def test_record_ranges_of_motion(mock_motors, dummy_motors):
    positions = {
        1: [351, 42, 1337],
@@ -147,6 +147,7 @@ def test_multi_task_dit_policy_forward(batch_size: int, state_dim: int, action_d
    )

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    policy.train()

    # Use preprocessor to handle tokenization
@@ -336,6 +337,7 @@ def test_multi_task_dit_policy_select_action(batch_size: int, state_dim: int, ac
    )

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    policy.eval()
    policy.reset()  # Reset queues before inference

@@ -390,6 +392,7 @@ def test_multi_task_dit_policy_diffusion_objective():
    config.validate_features()

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    policy.train()

    # Use preprocessor to handle tokenization
@@ -468,6 +471,7 @@ def test_multi_task_dit_policy_flow_matching_objective():
    config.validate_features()

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    policy.train()

    # Use preprocessor to handle tokenization
@@ -533,16 +537,12 @@ def test_multi_task_dit_policy_save_and_load(tmp_path):
    )

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    policy.eval()

-    # Get device before saving
-    device = next(policy.parameters()).device
-
    policy.save_pretrained(root)
    loaded_policy = MultiTaskDiTPolicy.from_pretrained(root, config=config)
-
-    # Explicitly move loaded_policy to the same device
-    loaded_policy.to(device)
+    loaded_policy.to(config.device)
    loaded_policy.eval()

    batch = create_train_batch(
@@ -565,10 +565,6 @@ def test_multi_task_dit_policy_save_and_load(tmp_path):
        with seeded_context(12):
            # Process batch through preprocessor
            processed_batch = preprocessor(batch)
-            # Move batch to the same device as the policy
-            for key in processed_batch:
-                if isinstance(processed_batch[key], torch.Tensor):
-                    processed_batch[key] = processed_batch[key].to(device)
            # Collect policy values before saving
            loss, _ = policy.forward(processed_batch)

@@ -608,6 +604,7 @@ def test_multi_task_dit_policy_get_optim_params():
    )

    policy = MultiTaskDiTPolicy(config=config)
+    policy.to(config.device)
    param_groups = policy.get_optim_params()

    # Should have 2 parameter groups: non-vision and vision encoder
@@ -196,6 +196,8 @@ def test_policy(ds_repo_id, env_name, env_kwargs, policy_name, policy_kwargs):

    for key in batch:
        if isinstance(batch[key], torch.Tensor):
+            if batch[key].dtype == torch.uint8:
+                batch[key] = batch[key].to(dtype=torch.float32) / 255.0
            batch[key] = batch[key].to(DEVICE, non_blocking=True)

    # Test updating the policy (and test that it does not mutate the batch)
@@ -18,6 +18,11 @@ from unittest.mock import MagicMock, patch

 import pytest

+from lerobot.utils.import_utils import is_package_available
+
+if not is_package_available("reachy2_sdk"):
+    pytest.skip("reachy2_sdk not available", allow_module_level=True)
+
 from lerobot.teleoperators.reachy2_teleoperator import (
    REACHY2_ANTENNAS_JOINTS,
    REACHY2_L_ARM_JOINTS,