fix(rl): enhance intervention handling in actor and learner

fix(rl): improve action processing for discrete and continuous actions
fix(rl): postprocess action in actor
2026-06-16 07:49:48 +00:00 · 2026-04-26 23:09:33 +02:00 · 2026-04-26 22:47:52 +02:00 · 2026-04-26 18:15:04 +02:00 · 2026-04-26 18:11:26 +02:00 · 2026-04-26 18:08:13 +02:00
112 changed files with 6986 additions and 3006 deletions
@@ -2,11 +2,6 @@

 Short, imperative summary (e.g., "fix(robots): handle None in sensor parser"). See [CONTRIBUTING.md](../CONTRIBUTING.md) for PR conventions.

-## Type / Scope
-
- **Type**: (Bug | Feature | Docs | Performance | Test | CI | Chore)
- **Scope**: (optional — name of module or package affected)
-
 ## Summary / Motivation

 - One-paragraph description of what changes and why.
@@ -19,28 +14,14 @@ Short, imperative summary (e.g., "fix(robots): handle None in sensor parser"). S

 ## What changed

- Short, concrete bullets of the modifications (files/behaviour).
+- Short, concrete bullets explaining the functional changes (how the behavior or output differs now).
 - Short note if this introduces breaking changes and migration steps.

 ## How was this tested (or how to run locally)

- Tests added: list new tests or test files.
+- Tests added: list new tests or test files, and the command used to run them (e.g., `pytest -q tests/ -k <keyword>`).
 - Manual checks / dataset runs performed.
- Instructions for the reviewer
-
-Example:
-
- Ran the relevant tests:
-
-  ```bash
-  pytest -q tests/ -k <keyword>
-  ```
-
- Reproduce with a quick example or CLI (if applicable):
-
-  ```bash
-  lerobot-train --some.option=true
-  ```
+- Instructions for the reviewer to reproduce the change with a quick example or CLI command (if applicable).

 ## Checklist (required before merge)

@@ -48,6 +29,7 @@ Example:
 - [ ] All tests pass locally (`pytest`)
 - [ ] Documentation updated
 - [ ] CI is green
+- [ ] Community Review: I have reviewed another contributor's open PR and linked it here: # (insert PR number/link)

 ## Reviewer notes

@@ -83,10 +83,13 @@ jobs:
          cache-binary: false

      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}

      # Build the benchmark-specific image. The Dockerfile separates dep-install
      # from source-copy, so code-only changes skip the slow uv-sync layer
@@ -115,7 +118,7 @@ jobs:
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=pepijn223/smolvla_libero \
+                --policy.path=lerobot/smolvla_libero \
                --env.type=libero \
                --env.task=libero_spatial \
                --eval.batch_size=1 \
@@ -144,7 +147,7 @@ jobs:
            --artifacts-dir /tmp/libero-artifacts \
            --env libero \
            --task libero_spatial \
-            --policy pepijn223/smolvla_libero
+            --policy lerobot/smolvla_libero

      - name: Upload Libero rollout video
        if: always()
@@ -238,10 +241,13 @@ jobs:
          cache-binary: false

      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}

      - name: Build MetaWorld benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
@@ -264,7 +270,7 @@ jobs:
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=pepijn223/smolvla_metaworld \
+                --policy.path=lerobot/smolvla_metaworld \
                --env.type=metaworld \
                --env.task=metaworld-push-v3 \
                --eval.batch_size=1 \
@@ -293,7 +299,7 @@ jobs:
            --artifacts-dir /tmp/metaworld-artifacts \
            --env metaworld \
            --task metaworld-push-v3 \
-            --policy pepijn223/smolvla_metaworld
+            --policy lerobot/smolvla_metaworld

      - name: Upload MetaWorld rollout video
        if: always()
@@ -311,9 +317,121 @@ jobs:
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn

-  # ── LIBERO-plus ───────────────────────────────────────────────────────────
-  libero-plus-integration-test:
-    name: LIBERO-plus — build image + 1-episode eval
+  # ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
+  # Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
+  # pytorch3d, + simulation assets (~4 GB).
+  # Build takes ~20 min on first run; subsequent runs hit the layer cache.
+  # Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
+  robotwin-integration-test:
+    name: RoboTwin 2.0 — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOTWIN_POLICY: lerobot/smolvla_robotwin
+      ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      # Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
+      # simulation assets (~4 GB). Layer cache is stored in the runner-local
+      # directory /tmp/.buildx-cache-robotwin — reused across re-runs on the same machine.
+      - name: Build RoboTwin 2.0 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robotwin
+          push: false
+          load: true
+          tags: lerobot-benchmark-robotwin:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robotwin
+          cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
+
+      - name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so we can docker cp artifacts out.
+          docker run --name robotwin-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
+            -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
+            lerobot-benchmark-robotwin:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              cd /opt/robotwin && lerobot-eval \
+                --policy.path=\"\$ROBOTWIN_POLICY\" \
+                --env.type=robotwin \
+                --env.task=\"\$ROBOTWIN_TASKS\" \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python /lerobot/scripts/ci/extract_task_descriptions.py \
+                --env robotwin \
+                --task \"\$ROBOTWIN_TASKS\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboTwin artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robotwin-artifacts
+          docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
+          docker rm -f robotwin-eval || true
+
+      - name: Parse RoboTwin eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robotwin-artifacts \
+            --env robotwin \
+            --task "${ROBOTWIN_TASKS}" \
+            --policy "${ROBOTWIN_POLICY}"
+
+      # Upload steps run with `always()` so artifacts are published even when
+      # an earlier step failed; a missing path only produces a warning.
+      - name: Upload RoboTwin rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-rollout-video
+          path: /tmp/robotwin-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboTwin eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-metrics
+          path: /tmp/robotwin-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOCASA365 ──────────────────────────────────────────────────────────
+  # Isolated image: robocasa + robosuite installed manually as editable
+  # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
+  # `lerobot==0.3.3`, which would shadow this repo's lerobot).
+  robocasa-integration-test:
+    name: RoboCasa365 — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
@@ -330,6 +448,328 @@ jobs:
        with:
          cache-binary: false

+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboCasa365 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocasa
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocasa:ci
+
+      - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name robocasa-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e MUJOCO_GL=egl \
+            lerobot-benchmark-robocasa:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robocasa \
+                --env.type=robocasa \
+                --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robocasa \
+                --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboCasa365 artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocasa-artifacts
+          docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
+          docker rm -f robocasa-eval || true
+
+      - name: Parse RoboCasa365 eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocasa-artifacts \
+            --env robocasa \
+            --task atomic_smoke_10 \
+            --policy lerobot/smolvla_robocasa
+
+      - name: Upload RoboCasa365 rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-rollout-video
+          path: /tmp/robocasa-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCasa365 eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-metrics
+          path: /tmp/robocasa-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOCEREBRA ───────────────────────────────────────────────────────────
+  # Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera
+  # defaults (image/wrist_image). The image is layered on
+  # huggingface/lerobot-gpu, which already ships [libero] as part of [all].
+  robocerebra-integration-test:
+    name: RoboCerebra — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboCerebra benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocerebra
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocerebra:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
+          cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
+
+      - name: Run RoboCerebra smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name robocerebra-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e LIBERO_DATA_FOLDER=/tmp/libero_data \
+            lerobot-benchmark-robocerebra:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robocerebra \
+                --env.type=libero \
+                --env.task=libero_10 \
+                --env.fps=20 \
+                --env.obs_type=pixels_agent_pos \
+                --env.observation_height=256 \
+                --env.observation_width=256 \
+                '--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero --task libero_10 \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboCerebra artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocerebra-artifacts
+          docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
+          docker rm -f robocerebra-eval || true
+
+      - name: Parse RoboCerebra eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocerebra-artifacts \
+            --env robocerebra \
+            --task libero_10 \
+            --policy lerobot/smolvla_robocerebra
+
+      - name: Upload RoboCerebra rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocerebra-rollout-video
+          path: /tmp/robocerebra-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCerebra eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocerebra-metrics
+          path: /tmp/robocerebra-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOMME ───────────────────────────────────────────────────────────────
+  # Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy
+  # overrides (robomme can't be a pyproject extra due to numpy<2 pin).
+  robomme-integration-test:
+    name: RoboMME — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOMME_POLICY: lerobot/smolvla_robomme
+      ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboMME benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robomme
+          push: false
+          load: true
+          tags: lerobot-benchmark-robomme:ci
+
+      # Runs the smoke eval inside the freshly built image. The container is
+      # named (no --rm) so a later step can `docker cp` the artifacts out of it.
+      # Arguments containing brackets or braces are single-quoted so the inner
+      # bash never treats them as glob patterns.
+      - name: Run RoboMME smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name robomme-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e ROBOMME_POLICY="${ROBOMME_POLICY}" \
+            -e ROBOMME_TASKS="${ROBOMME_TASKS}" \
+            lerobot-benchmark-robomme:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=\"\$ROBOMME_POLICY\" \
+                --env.type=robomme \
+                --env.task=\"\$ROBOMME_TASKS\" \
+                --env.dataset_split=test \
+                '--env.task_ids=[0]' \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
+                --policy.empty_cameras=3 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robomme --task \"\$ROBOMME_TASKS\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboMME artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robomme-artifacts
+          docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
+          docker rm -f robomme-eval || true
+
+      - name: Parse RoboMME eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robomme-artifacts \
+            --env robomme \
+            --task "${ROBOMME_TASKS}" \
+            --policy "${ROBOMME_POLICY}"
+
+      - name: Upload RoboMME rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-rollout-video
+          path: /tmp/robomme-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboMME eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-metrics
+          path: /tmp/robomme-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── LIBERO-plus ───────────────────────────────────────────────────────────
+  # Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of
+  # huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus).
+  libero-plus-integration-test:
+    name: LIBERO-plus — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      LIBERO_PLUS_SUITE: libero_spatial
+      LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus
+      LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]"
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
      - name: Build LIBERO-plus benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
@@ -349,14 +789,17 @@ jobs:
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \
+            -e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \
+            -e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \
            lerobot-benchmark-libero-plus:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=lerobot/smolvla_libero_plus \
+                --policy.path=\"\$LIBERO_PLUS_POLICY\" \
                --env.type=libero_plus \
-                --env.task=libero_spatial \
-                '--env.task_ids=[0,100,260,500,1000,1500,2000,2400]' \
+                --env.task=\"\$LIBERO_PLUS_SUITE\" \
+                --env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -365,7 +808,7 @@ jobs:
                --policy.empty_cameras=1 \
                --output_dir=/tmp/eval-artifacts
              python scripts/ci/extract_task_descriptions.py \
-                --env libero_plus --task libero_spatial \
+                --env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \
                --output /tmp/eval-artifacts/task_descriptions.json
            "

@@ -382,8 +825,8 @@ jobs:
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/libero-plus-artifacts \
            --env libero_plus \
-            --task libero_spatial \
-            --policy lerobot/smolvla_libero_plus
+            --task "${LIBERO_PLUS_SUITE}" \
+            --policy "${LIBERO_PLUS_POLICY}"

      - name: Upload LIBERO-plus rollout video
        if: always()
@@ -401,16 +844,17 @@ jobs:
          path: /tmp/libero-plus-artifacts/metrics.json
          if-no-files-found: warn

-  # ── ROBOMME ───────────────────────────────────────────────────────────────
-  robomme-integration-test:
-    name: RoboMME — build image + 1-episode eval
+  # ── VLABENCH ─────────────────────────────────────────────────────────────
+  # Isolated image: lerobot[vlabench] only (VLABench, mujoco==3.2.2, dm-control chain)
+  vlabench-integration-test:
+    name: VLABench — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false
          lfs: true
@@ -420,71 +864,82 @@ jobs:
        with:
          cache-binary: false

-      - name: Build RoboMME benchmark image
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build VLABench benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
          context: .
-          file: docker/Dockerfile.benchmark.robomme
+          file: docker/Dockerfile.benchmark.vlabench
          push: false
          load: true
-          tags: lerobot-benchmark-robomme:ci
+          tags: lerobot-benchmark-vlabench:ci
+          build-args: |
+            VLABENCH_ASSETS_REPO=lerobot/vlabench-assets

-      - name: Run RoboMME smoke eval (1 episode)
+      - name: Run VLABench smoke eval (10 tasks, 1 episode each)
        if: env.HF_USER_TOKEN != ''
        run: |
-          docker run --name robomme-eval --gpus all \
+          docker run --name vlabench-eval --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
-            lerobot-benchmark-robomme:ci \
+            -e MUJOCO_GL=egl \
+            lerobot-benchmark-vlabench:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=lerobot/smolvla_robomme \
-                --env.type=robomme \
-                --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
-                --env.dataset_split=test \
+                --policy.path=lerobot/smolvla_vlabench \
+                --env.type=vlabench \
+                --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
                --policy.device=cuda \
-                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
-                --policy.empty_cameras=3 \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \
                --output_dir=/tmp/eval-artifacts
              python scripts/ci/extract_task_descriptions.py \
-                --env robomme --task PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+                --env vlabench \
+                --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
                --output /tmp/eval-artifacts/task_descriptions.json
            "

-      - name: Copy RoboMME artifacts from container
+      - name: Copy VLABench artifacts from container
        if: always()
        run: |
-          mkdir -p /tmp/robomme-artifacts
-          docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
-          docker rm -f robomme-eval || true
+          mkdir -p /tmp/vlabench-artifacts
+          docker cp vlabench-eval:/tmp/eval-artifacts/. /tmp/vlabench-artifacts/ 2>/dev/null || true
+          docker rm -f vlabench-eval || true

-      - name: Parse RoboMME eval metrics
+      - name: Parse VLABench eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
-            --artifacts-dir /tmp/robomme-artifacts \
-            --env robomme \
-            --task PickXtimes \
-            --policy lerobot/smolvla_robomme
+            --artifacts-dir /tmp/vlabench-artifacts \
+            --env vlabench \
+            --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+            --policy lerobot/smolvla_vlabench

-      - name: Upload RoboMME rollout video
+      - name: Upload VLABench rollout video
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
-          name: robomme-rollout-video
-          path: /tmp/robomme-artifacts/videos/
+          name: vlabench-rollout-video
+          path: /tmp/vlabench-artifacts/videos/
          if-no-files-found: warn

-      - name: Upload RoboMME eval metrics
+      - name: Upload VLABench eval metrics
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
-          name: robomme-metrics
-          path: /tmp/robomme-artifacts/metrics.json
+          name: vlabench-metrics
+          path: /tmp/vlabench-artifacts/metrics.json
          if-no-files-found: warn
@@ -33,7 +33,7 @@ jobs:
      github.event.workflow_run.event == 'pull_request' &&
      github.event.workflow_run.conclusion == 'success' &&
      github.repository == 'huggingface/lerobot'
-    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3  # main
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6  # main
    with:
      package_name: lerobot
    secrets:
@@ -217,6 +217,24 @@ jobs:
      - name: Run end-to-end tests
        run: make test-end-to-end

+  slack-notification:
+    name: Slack Notification
+    needs: [cpu-tests, gpu-tests, upgrade-lock]
+    if: always() && needs.upgrade-lock.outputs.changed == 'true'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    env:
+      CI_SLACK_CHANNEL: ${{ secrets.CI_SLACK_CHANNEL }}
+    steps:
+      - name: Post to a Slack channel
+        uses: huggingface/hf-workflows/.github/actions/post-slack@a88e7fa2eaee28de5a4d6142381b1fb792349b67  # main
+        with:
+          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
+          title: "Results of the latest dependency tests (CPU + GPU)"
+          status: ${{ (needs.cpu-tests.result == 'success' && needs.gpu-tests.result == 'success') && 'success' || 'failure' }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
  # This job creates or updates a PR with the upgraded lockfile
  open-pr:
    name: Open PR
@@ -1,5 +1,7 @@
 This file provides guidance to AI agents when working with code in this repository.

+> **User-facing help → [`AGENT_GUIDE.md`](./AGENT_GUIDE.md)** (SO-101 setup, recording, picking a policy, training duration, eval — with copy-pasteable commands).
+
 ## Project Overview

 LeRobot is a PyTorch-based library for real-world robotics, providing datasets, pretrained policies, and tools for training, evaluation, data collection, and robot control. It integrates with Hugging Face Hub for model/dataset sharing.
@@ -0,0 +1,410 @@
+# AGENT_GUIDE.md — LeRobot Helper for AI Agents & Users
+
+This file is a practical, copy-paste-friendly companion for any AI agent (Cursor, Claude, ChatGPT, Codex, etc.) helping a user work with LeRobot. It complements [`AGENTS.md`](./AGENTS.md) (dev/contributor context) with **user-facing guidance**: how to start, what to train, how long, how to record, and how to calibrate an SO-101.
+
+---
+
+## 1. Start here — ask the user first (MANDATORY)
+
+Before suggesting any command, an agent MUST ask the user at least these questions and wait for answers:
+
+1. **What's your goal?** (e.g. "teach my SO-101 to fold a cloth", "train a policy on an existing HF dataset", "contribute a PR", "understand the codebase")
+2. **What hardware do you have?**
+   - Robot: none / SO-100 / SO-101 / Koch / LeKiwi / Reachy / other
+   - Teleop: leader arm / phone / keyboard / gamepad / none
+   - Cameras: how many, resolution, fixed or moving?
+3. **What machine will you train on?**
+   - GPU model + VRAM (e.g. "laptop 3060 6 GB", "RTX 4090 24 GB", "A100 80 GB", "CPU only")
+   - OS: macOS / Linux / Windows
+4. **Skill level & time budget?** First time, some ML, experienced? Hours, days, a weekend?
+5. **Do you already have a dataset?** Yes (HF repo id?) / no / want to record one
+6. **How can I help right now?** (pick one concrete next step)
+
+Only after you have answers, propose a concrete path. If something is ambiguous, ask again rather than guessing. Bias toward **the simplest thing that works** for the user's hardware and goal.
+
+---
+
+## 2. LeRobot in 60 seconds
+
+LeRobot = **datasets + policies + envs + robot control**, unified by a small set of strong abstractions.
+
+- **`LeRobotDataset`** — episode-aware dataset (video or images + actions + state), loadable from the Hub or disk.
+- **Policies** (`ACT`, `Diffusion`, `SmolVLA`, `π0`, `π0.5`, `Wall-X`, `X-VLA`, `VQ-BeT`, `TD-MPC`, …) — all inherit `PreTrainedPolicy` and can be pushed/pulled from the Hub.
+- **Processors** — small composable transforms between dataset → policy → robot.
+- **Envs** (sim) and **Robots** (real) — same action/observation contract so code swaps cleanly.
+- **CLI** — `lerobot-record`, `lerobot-train`, `lerobot-eval`, `lerobot-teleoperate`, `lerobot-calibrate`, `lerobot-find-port`, `lerobot-setup-motors`, `lerobot-replay`.
+
+See [`AGENTS.md`](./AGENTS.md) for repo architecture.
+
+---
+
+## 3. Quickstart paths (pick one)
+
+### Path A — "I have an SO-101 and want my first trained policy"
+
+Go to §4 (SO-101 end-to-end), then §5 (data tips), then §6 (pick a policy — likely **ACT**), then §7 (how long), then §8 (eval).
+
+### Path B — "No hardware, I want to train on an existing dataset"
+
+Skip the hardware steps in §4 (you only need §4.1 to install and §4.9 to train). Pick a policy in §6, pick a duration in §7, then run `lerobot-train` per §4.9 with a Hub `--dataset.repo_id` and an `--env.type` for eval. Finish with §8.
+
+### Path C — "I just want to understand the codebase"
+
+Read §2 above, then `AGENTS.md` "Architecture", then open `src/lerobot/policies/act/` and `src/lerobot/datasets/lerobot_dataset.py` as canonical examples.
+
+---
+
+## 4. SO-101 end-to-end cheat-sheet
+
+Full details in [`docs/source/so101.mdx`](./docs/source/so101.mdx) and [`docs/source/il_robots.mdx`](./docs/source/il_robots.mdx). The steps below are the minimum commands, in order. Confirm the arms are assembled and powered before running them.
+
+**4.1 Install**
+
+```bash
+pip install 'lerobot[feetech]'              # SO-100/SO-101 motor stack
+# pip install 'lerobot[all]'                # everything
+# pip install 'lerobot[aloha,pusht]'        # specific features
+# pip install 'lerobot[smolvla]'            # add SmolVLA deps
+git lfs install && git lfs pull
+hf auth login                               # required to push datasets/policies
+```
+
+Contributors can alternatively use `uv sync --locked --extra feetech` (see `AGENTS.md`).
+
+**4.2 Find USB ports** — run once per arm, unplug when prompted.
+
+```bash
+lerobot-find-port
+```
+
+macOS: `/dev/tty.usbmodem...`; Linux: `/dev/ttyACM0` (may need `sudo chmod 666 /dev/ttyACM0`).
+
+**4.3 Setup motor IDs & baudrate** (one-time, per arm)
+
+```bash
+lerobot-setup-motors --robot.type=so101_follower --robot.port=<FOLLOWER_PORT>
+lerobot-setup-motors --teleop.type=so101_leader  --teleop.port=<LEADER_PORT>
+```
+
+**4.4 Calibrate** — center all joints, press Enter, sweep each joint through its full range. The `id` is the calibration key — reuse it everywhere.
+
+```bash
+lerobot-calibrate --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower
+lerobot-calibrate --teleop.type=so101_leader  --teleop.port=<LEADER_PORT>   --teleop.id=my_leader
+```
+
+**4.5 Teleoperate** (sanity check, no recording)
+
+```bash
+lerobot-teleoperate \
+  --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
+  --teleop.type=so101_leader  --teleop.port=<LEADER_PORT>  --teleop.id=my_leader \
+  --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
+  --display_data=true
+```
+
+> **Feetech timeout / comms error on SO-100 / SO-101?** Before touching software, check the **red motor LEDs** on the daisy chain.
+>
+> - **All steady red, gripper → base chain** → wiring OK.
+> - **One or more motors dark / chain stops mid-way** → wiring issue: reseat the 3-pin cables, check the controller-board power supply, and make sure each motor is fully clicked in.
+> - **LEDs blinking** → the motor is in an **error state**: usually overload (forcing a joint past its limit) **or wrong power supply voltage**. SO-100 / SO-101 ship in two variants — a **5 V / 7.4 V** build and a **12 V** build — they are NOT interchangeable. Using a 12 V PSU on a 5 V / 7.4 V arm (or vice-versa) will trip this error; confirm your motor variant before powering up.
+>
+> Most "timeout" errors are physical, not code.
+
+**4.6 Record a dataset** — keys: **→** next, **←** redo, **ESC** finish & upload.
+
+```bash
+HF_USER=$(NO_COLOR=1 hf auth whoami | awk -F': *' 'NR==1 {print $2}')
+
+lerobot-record \
+  --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
+  --teleop.type=so101_leader  --teleop.port=<LEADER_PORT>  --teleop.id=my_leader \
+  --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
+  --dataset.repo_id=${HF_USER}/my_task \
+  --dataset.single_task="<describe the task in one sentence>" \
+  --dataset.num_episodes=50 \
+  --dataset.episode_time_s=30 \
+  --dataset.reset_time_s=10 \
+  --display_data=true
+```
+
+**4.7 Visualize** — **always** do this before training. Look for missing frames, camera blur, unreachable targets, inconsistent object positions.
+After upload: https://huggingface.co/spaces/lerobot/visualize_dataset → paste `${HF_USER}/my_task`. Works for **any LeRobot-formatted Hub dataset** — use it to scout other datasets, inspect episode quality, or debug your own data before retraining.
+
+**4.8 Replay an episode** (sanity check)
+
+```bash
+lerobot-replay --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
+  --dataset.repo_id=${HF_USER}/my_task --dataset.episode=0
+```
+
+**4.9 Train** (default: ACT — fastest, lowest memory). Apple silicon: `--policy.device=mps`. See §6/§7 for policy and duration.
+
+```bash
+lerobot-train \
+  --dataset.repo_id=${HF_USER}/my_task \
+  --policy.type=act \
+  --policy.device=cuda \
+  --output_dir=outputs/train/act_my_task \
+  --job_name=act_my_task \
+  --batch_size=8 \
+  --wandb.enable=true \
+  --policy.repo_id=${HF_USER}/act_my_task
+```
+
+**4.10 Evaluate on the real robot** — compare success rate to a teleoperated baseline.
+
+```bash
+lerobot-record \
+  --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
+  --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
+  --dataset.repo_id=${HF_USER}/eval_my_task \
+  --dataset.single_task="<same task description as training>" \
+  --dataset.num_episodes=10 \
+  --policy.path=${HF_USER}/act_my_task
+```
+
+---
+
+## 5. Data collection tips (beginner → reliable policy)
+
+Good data beats clever models. Adopt these defaults and deviate only with evidence.
+
+### 5.1 Setup & ergonomics
+
+- **Fix the rig and cameras** before touching the software. If the rig vibrates or the operator gets frustrated, fix that first — more bad data won't help.
+- **Lighting matters more than resolution.** Diffuse, consistent light. Avoid moving shadows.
+- **"Can you do the task from the camera view alone?"** If no, your cameras are wrong. Fix before recording.
+- When available, enable **action interpolation** during rollouts for smoother trajectories.
+
+### 5.2 Practice before you record
+
+- Do 5–10 demos without recording. Build a deliberate, repeatable strategy.
+- Hesitant or inconsistent demos teach the model hesitation.
+
+### 5.3 Quality over speed
+
+Deliberate, high-quality execution beats fast sloppy runs. Optimize for speed only **after** strategy is dialed in — never trade quality for it.
+
+### 5.4 Consistency within and across episodes
+
+Same grasp, approach vector, and timing. Coherent strategies are much easier to learn than wildly varying movements.
+
+### 5.5 Start small, then extend (the golden rule)
+
+- **First 50 episodes = constrained version** of the task: one object, fixed position, fixed camera setup, one operator.
+- Train a quick ACT model. See what fails.
+- **Then add diversity** along one axis at a time: more positions → more lighting → more objects → more operators.
+- Don't try to collect the "perfect dataset" on day one. Iterate.
+
+### 5.6 Policy choice for beginners
+
+- **Laptop / first time / want results fast → ACT.** Works surprisingly well, trains fast even on a laptop GPU.
+- **Bigger GPU / language-conditioned / multi-task → SmolVLA.** Unfreezing the vision encoder (see §7.6) is a big win here.
+- Defer π0 / π0.5 / Wall-X / X-VLA until you have a proven ACT baseline and a 20+ GB GPU.
+
+### 5.7 Recommended defaults for your first task
+
+| Setting          | Value                                                                                                                                                 |
+| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Episodes         | **50** to start, scale to 100–300 after first training                                                                                                |
+| Episode length   | 20–45 s (shorter is fine for grasp/place)                                                                                                             |
+| Reset time       | 10 s                                                                                                                                                  |
+| FPS              | 30                                                                                                                                                    |
+| Cameras          | **2 cameras recommended**: 1 fixed front + 1 wrist. Multi-view often outperforms single-view. A single fixed camera also works to keep things simple. |
+| Task description | Short, specific, action-phrased sentence                                                                                                              |
+
+### 5.8 Troubleshooting signal
+
+- Policy fails at one specific stage → record 10–20 more episodes **targeting that stage**.
+- Policy flaps / oscillates → likely inconsistent demos or insufficient training; re-record the worst episodes (use **←** to redo).
+- Policy ignores the object → camera framing or lighting issue, not a model issue.
+
+See also: [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets#what-makes-a-good-dataset).
+
+---
+
+## 6. Which policy should I train?
+
+Match the policy to the user's **GPU memory** and **time budget**. Numbers below come from an internal profiling run (one training update per policy). They are **indicative only** — see caveats.
+
+### 6.1 Profiling snapshot (indicative)
+
+All policies typically train for **5–10 epochs** (see §7).
+
+| Policy      | Batch | Update (ms) | Peak GPU mem (GB) | Best for                                                                                         |
+| ----------- | ----: | ----------: | ----------------: | ------------------------------------------------------------------------------------------------ |
+| `act`       |     4 |    **83.9** |          **0.94** | First-time users, laptops, single-task. Fast and reliable.                                       |
+| `diffusion` |     4 |       168.6 |              4.94 | Multi-modal action distributions; needs mid-range GPU.                                           |
+| `smolvla`   |     1 |       357.8 |              3.93 | Language-conditioned, multi-task, small VLA. **Unfreeze vision encoder for big gains** (see §7). |
+| `xvla`      |     1 |       731.6 |             15.52 | Large VLA, multi-task.                                                                           |
+| `wall_x`    |     1 |       716.5 |             15.95 | Large VLA with world-model objective.                                                            |
+| `pi0`       |     1 |       940.3 |             15.50 | Strong large VLA baseline (Physical Intelligence).                                               |
+| `pi05`      |     1 |      1055.8 |             16.35 | Newer π policy; similar footprint to `pi0`.                                                      |
+
+**Critical caveats:**
+
+- **Optimizer:** measured with **SGD**. LeRobot's default is **AdamW**, which keeps extra optimizer state → **peak memory will be noticeably higher** with the default, especially for `pi0`, `pi05`, `wall_x`, `xvla`.
+- **Batch size:** the large policies were profiled at batch 1. In practice use a **larger batch** for stable training (see §7.4). Memory scales roughly linearly with batch.
+
+### 6.2 Decision rules
+
+- **< 8 GB VRAM (laptop, 3060, M-series Mac):** → `act`. Maybe `diffusion` if you have ~6–8 GB free.
+- **12–16 GB VRAM (4070/4080, A4000):** → `smolvla` with defaults, or `act`/`diffusion` with larger batch. `pi0`/`pi05`/`wall_x`/`xvla` feasible only with small batch + gradient accumulation.
+- **24+ GB VRAM (3090/4090/A5000):** → any policy. Prefer `smolvla` (unfrozen) for multi-task; `act` for single-task grasp-and-place (still often the best ROI). You can also experiment with `pi0`, `pi05`, or `xvla`.
+- **80 GB (A100/H100):** → any, with healthy batch. `pi05`, `xvla`, `wall_x` become comfortable.
+- **CPU only:** → don't train here. Use Google Colab (see [`docs/source/notebooks.mdx`](./docs/source/notebooks.mdx)) or a rented GPU.
+
+---
+
+## 7. How long should I train?
+
+Robotics imitation learning usually converges in a **few epochs over the dataset**, not hundreds of thousands of raw steps. Think **epochs first**, then translate to steps.
+
+### 7.1 Rule of thumb
+
+- **Typical total: 5–10 epochs.** Start at 5, eval, then decide if more helps.
+- Very small datasets (< 30 episodes) may want slightly more epochs — but first, **collect more data**.
+- VLAs with a pretrained vision backbone typically need **fewer** epochs than training from scratch.
+
+### 7.2 Steps ↔ epochs conversion
+
+```
+total_frames     = sum of frames over all episodes      # e.g. 50 eps × 30 fps × 30 s ≈ 45,000
+steps_per_epoch  = ceil(total_frames / batch_size)
+total_steps      = epochs × steps_per_epoch
+```
+
+Examples for `--batch_size=8`:
+
+| Dataset size            |  Frames | Steps / epoch | 5 epochs | 10 epochs |
+| ----------------------- | ------: | ------------: | -------: | --------: |
+| 50 eps × 30 s @ 30 fps  |  45,000 |        ~5,625 |      28k |       56k |
+| 100 eps × 30 s @ 30 fps |  90,000 |       ~11,250 |      56k |      113k |
+| 300 eps × 30 s @ 30 fps | 270,000 |       ~33,750 |     169k |      338k |
+
+Pass the resulting total with `--steps=<N>`; eval at intermediate checkpoints (`outputs/train/.../checkpoints/`).
+
+### 7.3 Per-policy starting points (single-task, ~50 episodes)
+
+| Policy         | Batch | Steps (first run) | Notes                                                             |
+| -------------- | ----: | ----------------: | ----------------------------------------------------------------- |
+| `act`          |  8–16 |           30k–80k | Usually converges under 50k for single-task.                      |
+| `diffusion`    |  8–16 |          80k–150k | Benefits from longer training than ACT.                           |
+| `smolvla`      |   4–8 |           30k–80k | Pretrained VLM → converges fast.                                  |
+| `pi0` / `pi05` |   1–4 |           30k–80k | Memory-bound; use gradient accumulation for effective batch ≥ 16! |
+
+### 7.4 Batch size guidance
+
+- **Bigger batch is preferable** for stable gradients on teleop data.
+- If GPU memory is the bottleneck, use **gradient accumulation** to raise _effective_ batch without raising peak memory.
+- Scale **learning rate** gently with batch; most LeRobot defaults work fine for a 2–4× batch change.
+
+### 7.5 Scale LR schedule & checkpoints with `--steps`
+
+LeRobot's default schedulers (e.g. SmolVLA's cosine decay) use `scheduler_decay_steps=30_000`, which is sized for long training runs. When you shorten training (e.g. 5k–10k steps on a small dataset), **scale the scheduler down to match** — otherwise the LR stays near the peak and never decays. Same for checkpoint frequency.
+
+```bash
+lerobot-train ... \
+  --steps=5000 \
+  --policy.scheduler_decay_steps=5000 \
+  --save_freq=5000
+```
+
+Rule of thumb: set `scheduler_decay_steps ≈ steps`, and `save_freq` to whatever granularity you want for eval (e.g. every 1k–5k steps). Match `scheduler_warmup_steps` proportionally if your run is very short.
+
+### 7.6 SmolVLA: unfreeze the vision encoder for real gains
+
+SmolVLA ships with `freeze_vision_encoder=True`. Unfreezing usually **improves performance substantially** on specialized tasks, at the cost of more VRAM and slower steps. Enable with:
+
+```bash
+lerobot-train ... --policy.type=smolvla \
+  --policy.freeze_vision_encoder=false \
+  --policy.train_expert_only=false
+```
+
+### 7.7 Signals to stop / keep going
+
+- Train loss plateaus → stop, save a Hub checkpoint.
+- Train loss still dropping and you're under 10 epochs → keep going.
+
+---
+
+## 8. Evaluation & benchmarks
+
+Two flavors of evaluation:
+
+### 8.1 Real-robot eval (SO-101, etc.)
+
+Reuse `lerobot-record` with `--policy.path` to run the trained policy on-robot and save the run as an eval dataset. Convention: prefix the dataset with `eval_`.
+
+```bash
+lerobot-record \
+  --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
+  --robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
+  --dataset.repo_id=${HF_USER}/eval_my_task \
+  --dataset.single_task="<same task description used during training>" \
+  --dataset.num_episodes=10 \
+  --policy.path=${HF_USER}/act_my_task
+```
+
+Report success rate across episodes. Compare to a teleoperated baseline and to an earlier checkpoint to catch regressions.
+
+### 8.2 Sim-benchmark eval
+
+For policies trained on sim datasets (PushT, Aloha, LIBERO, MetaWorld, RoboCasa, …) use `lerobot-eval` against the matching `env.type`:
+
+```bash
+lerobot-eval \
+  --policy.path=${HF_USER}/diffusion_pusht \
+  --env.type=pusht \
+  --eval.n_episodes=50 \
+  --eval.batch_size=10 \
+  --policy.device=cuda
+```
+
+- Use `--policy.path=outputs/train/.../checkpoints/<step>/pretrained_model` for local checkpoints.
+- `--eval.n_episodes` should be ≥ 50 for a stable success-rate estimate.
+- Available envs live in `src/lerobot/envs/`. See [`docs/source/libero.mdx`](./docs/source/libero.mdx), [`metaworld.mdx`](./docs/source/metaworld.mdx), [`robocasa.mdx`](./docs/source/robocasa.mdx), [`vlabench.mdx`](./docs/source/vlabench.mdx) for specific benchmarks.
+- To add a new benchmark, see [`docs/source/adding_benchmarks.mdx`](./docs/source/adding_benchmarks.mdx) and [`envhub.mdx`](./docs/source/envhub.mdx).
+
+### 8.2b Dockerfiles for benchmark eval
+
+Benchmark envs have native dependencies that are painful to install locally. The repo ships **pre-baked Dockerfiles** for each supported benchmark — use these to run `lerobot-eval` in a reproducible environment:
+
+| Benchmark   | Dockerfile                                                                             |
+| ----------- | -------------------------------------------------------------------------------------- |
+| LIBERO      | [`docker/Dockerfile.benchmark.libero`](./docker/Dockerfile.benchmark.libero)           |
+| LIBERO+     | [`docker/Dockerfile.benchmark.libero_plus`](./docker/Dockerfile.benchmark.libero_plus) |
+| MetaWorld   | [`docker/Dockerfile.benchmark.metaworld`](./docker/Dockerfile.benchmark.metaworld)     |
+| RoboCasa    | [`docker/Dockerfile.benchmark.robocasa`](./docker/Dockerfile.benchmark.robocasa)       |
+| RoboCerebra | [`docker/Dockerfile.benchmark.robocerebra`](./docker/Dockerfile.benchmark.robocerebra) |
+| RoboMME     | [`docker/Dockerfile.benchmark.robomme`](./docker/Dockerfile.benchmark.robomme)         |
+| RoboTwin    | [`docker/Dockerfile.benchmark.robotwin`](./docker/Dockerfile.benchmark.robotwin)       |
+| VLABench    | [`docker/Dockerfile.benchmark.vlabench`](./docker/Dockerfile.benchmark.vlabench)       |
+
+Build and run (adapt to your benchmark):
+
+```bash
+docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-bench-robomme .
+docker run --gpus all --rm -it \
+  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
+  lerobot-bench-robomme \
+  lerobot-eval --policy.path=<your_policy> --env.type=<env> --eval.n_episodes=50
+```
+
+See [`docker/README.md`](./docker/README.md) for base-image details.
+
+### 8.3 Target success rates
+
+Single-task grasp-and-place with 50 clean episodes: ACT should reach **> 70% success** on the training configuration. A lower rate usually points to a data problem (see §5), not a model problem. Expect a drop when generalizing to new positions — add episodes or diversity to recover.
+
+---
+
+## 9. Further reading & resources
+
+- **Getting started:** [`installation.mdx`](./docs/source/installation.mdx) · [`il_robots.mdx`](./docs/source/il_robots.mdx) · [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets)
+- **Per-policy docs:** browse [`docs/source/*.mdx`](./docs/source/) (policies, hardware, benchmarks, advanced training).
+- **Community:** [Discord](https://discord.com/invite/s3KuuzsPFb) · [Hub `LeRobot` tag](https://huggingface.co/datasets?other=LeRobot) · [Dataset visualizer](https://huggingface.co/spaces/lerobot/visualize_dataset)
+
+> Keep this file current. If you learn a rule that would prevent a class of user mistakes, add it here and in [`AGENTS.md`](./AGENTS.md).
@@ -78,6 +78,9 @@ Use the templates for required fields and examples.
 - **Issues:** Follow the [ticket template](https://github.com/huggingface/lerobot/blob/main/.github/ISSUE_TEMPLATE/bug-report.yml).
 - **Pull requests:** Rebase on `upstream/main`, use a descriptive branch (don't work on `main`), run `pre-commit` and tests locally, and follow the [PR template](https://github.com/huggingface/lerobot/blob/main/.github/PULL_REQUEST_TEMPLATE.md).

-One member of the LeRobot team will then review your contribution.
+> [!IMPORTANT]
+> Community Review Policy: To help scale our efforts and foster a collaborative environment, we ask contributors to review at least one other person's open PR before their own receives attention. This shared responsibility multiplies our review capacity and helps everyone's code get merged faster!
+
+Once you have submitted your PR and completed a peer review, a member of the LeRobot team will review your contribution.

 Thank you for contributing to LeRobot!
@@ -1 +0,0 @@
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
@@ -1,60 +0,0 @@
-# LeRobot LIBERO Training Benchmark
-
-Train and evaluate all LeRobot policies on [LIBERO](https://libero-project.github.io/) and publish results as a HuggingFace leaderboard dataset.
-
-## Policies
-
-| Policy         | Base Model           | GPUs | LR     | Chunk | Notes                                 |
-| -------------- | -------------------- | ---- | ------ | ----- | ------------------------------------- |
-| pi0            | lerobot/pi0_base     | 8    | 2.5e-5 | 30    | PaliGemma + Gemma flow matching       |
-| pi0_fast       | lerobot/pi0fast-base | 8    | 2.5e-5 | 30    | Requires tokenizer pre-training       |
-| pi05           | lerobot/pi05_base    | 8    | 2.5e-5 | 30    | Quantiles normalization               |
-| groot          | nvidia/GR00T-N1.5-3B | 8    | 1e-4   | 30    | bf16, diffusion head + projector only |
-| act            | From scratch         | 1    | 1e-5   | 30    | ResNet-18, lightweight                |
-| diffusion      | From scratch         | 1    | 1e-4   | 32\*  | U-Net, horizon must be divisible by 8 |
-| smolvla        | lerobot/smolvla_base | 8    | 1e-4   | 30    | SmolVLM2-500M                         |
-| xvla           | lerobot/xvla-widowx  | 4    | 1e-4   | 32\*  | Florence2 + CLIP                      |
-| multi_task_dit | From scratch         | 1    | 2e-5   | 32\*  | CLIP + DiT                            |
-
-\* These policies use `horizon` rather than `chunk_size`. Set to 32 (nearest valid value to 30).
-
-## Training spec
-
- **Steps**: 5,000 per policy
- **Batch size**: 32 per GPU (effective BS = 256 for multi-GPU)
- **Dataset**: `lerobot/libero` (libero_spatial)
- **Evaluation**: 20 episodes after training
- **LR**: each policy's default optimizer/scheduler preset
- **Results**: each SLURM job publishes its own row to the HF leaderboard dataset automatically
-
-## Quick start
-
-### 1. Generate SLURM scripts
-
-```bash
-python benchmarks/libero/run_benchmark.py \
-    --output_dir /scratch/lerobot-benchmark \
-    --hub_org lerobot
-```
-
-### 2. Submit jobs
-
-```bash
-# If using pi0_fast, submit tokenizer first:
-sbatch /scratch/lerobot-benchmark/slurm_scripts/00_tokenizer.sh
-# Wait, then submit pi0_fast
-
-# All other policies can run in parallel:
-for script in /scratch/lerobot-benchmark/slurm_scripts/[0-9][0-9]_*.sh; do
-    [[ "$script" == *pi0_fast* ]] && continue
-    sbatch "$script"
-done
-```
-
-Each job publishes its result to `lerobot/benchmark-libero` on the Hub when it finishes.
-
-## Prerequisites
-
- SLURM cluster with CUDA GPUs (A100 80GB recommended for VLM policies)
- `pip install lerobot[pi,smolvla,groot,xvla,multi_task_dit,libero] datasets`
- `huggingface-cli login`
@@ -1,606 +0,0 @@
-#!/usr/bin/env python
-"""Generate SLURM sbatch scripts for training all LeRobot policies on LIBERO.
-
-Each generated script trains one policy, evaluates it, and publishes its
-results row to a HuggingFace leaderboard dataset — no separate collection
-step needed.
-
-Usage:
-    # Generate scripts for all policies:
-    python benchmarks/libero/run_benchmark.py \\
-        --output_dir /scratch/lerobot-benchmark --hub_org lerobot
-
-    # Generate for a subset:
-    python benchmarks/libero/run_benchmark.py \\
-        --policies pi0 smolvla act \\
-        --output_dir /scratch/lerobot-benchmark --hub_org lerobot
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import subprocess
-import textwrap
-import uuid
-from dataclasses import dataclass, field
-from datetime import UTC, datetime
-from pathlib import Path
-
-# ──────────────────────────────────────────────────────────────────────
-# Policy benchmark configs
-# ──────────────────────────────────────────────────────────────────────
-
-
-@dataclass
-class PolicyBenchmarkConfig:
-    """Training configuration for a single policy on a benchmark.
-
-    Instances are stored in ``POLICY_CONFIGS`` and turned into CLI flags by
-    the script generators in this module.
-    """
-
-    # Value emitted as ``--policy.type``.
-    policy_type: str
-    # Optional pretrained checkpoint; emitted as ``--policy.path`` when truthy.
-    policy_path: str | None = None
-    # GPUs requested via ``#SBATCH --gres`` and ``accelerate launch --num_processes``.
-    num_gpus: int = 1
-    chunk_size: int | None = None  # Set on policies that use chunk_size (not horizon)
-    # Extra ``--key=value`` flags applied on top of the common training arguments.
-    extra_policy_args: dict[str, str] = field(default_factory=dict)
-    # NOTE(review): never read in the visible code; the tokenizer job is keyed
-    # on the "pi0_fast" policy name instead.
-    needs_tokenizer: bool = False
-    # Flags forwarded to ``lerobot-train-tokenizer`` by the tokenizer job.
-    tokenizer_args: dict[str, str] = field(default_factory=dict)
-
-
-# Flags shared by every training job; per-policy flags are layered on top of
-# these in ``_training_cli_args``. "steps" and "batch_size" are also parsed
-# back to ints by the script generators, so they must stay numeric strings.
-COMMON_TRAINING_ARGS: dict[str, str] = {
-    "dataset.repo_id": "lerobot/libero",
-    "dataset.use_imagenet_stats": "false",
-    "env.type": "libero",
-    "env.task": "libero_spatial",
-    "steps": "5000",
-    "batch_size": "32",
-    "eval_freq": "0",
-    "save_freq": "5000",
-    "save_checkpoint": "true",
-    "log_freq": "100",
-    "wandb.enable": "true",
-    "policy.push_to_hub": "true",
-    # JSON object passed as a single flag value; it contains braces and double
-    # quotes, so ``_cli_args`` wraps it in single quotes.
-    "rename_map": (
-        '{"observation.images.image":"observation.images.camera1",'
-        '"observation.images.image2":"observation.images.camera2"}'
-    ),
-}
-
-# Flags passed to ``lerobot-eval`` after training succeeds.
-EVAL_ARGS: dict[str, str] = {
-    "env.type": "libero",
-    "env.task": "libero_spatial",
-    "eval.n_episodes": "20",
-    "eval.batch_size": "10",
-}
-
-# Registry of benchmarked policies, keyed by the name accepted by --policies.
-# Policies with ``chunk_size=None`` set ``policy.horizon`` through
-# ``extra_policy_args`` instead of ``policy.chunk_size``.
-POLICY_CONFIGS: dict[str, PolicyBenchmarkConfig] = {
-    "pi0": PolicyBenchmarkConfig(
-        policy_type="pi0",
-        policy_path="lerobot/pi0_base",
-        num_gpus=8,
-        chunk_size=30,
-        extra_policy_args={
-            "policy.n_action_steps": "30",
-            "policy.scheduler_decay_steps": "5000",
-        },
-    ),
-    "pi0_fast": PolicyBenchmarkConfig(
-        policy_type="pi0_fast",
-        policy_path="lerobot/pi0fast-base",
-        num_gpus=8,
-        chunk_size=30,
-        extra_policy_args={
-            "policy.n_action_steps": "30",
-            "policy.scheduler_decay_steps": "5000",
-        },
-        needs_tokenizer=True,
-        tokenizer_args={
-            "repo_id": "lerobot/libero",
-            "action_horizon": "30",
-            "encoded_dims": "0:7",
-            "normalization_mode": "QUANTILES",
-            "vocab_size": "1024",
-            "scale": "10.0",
-            "push_to_hub": "true",
-        },
-    ),
-    "pi05": PolicyBenchmarkConfig(
-        policy_type="pi05",
-        policy_path="lerobot/pi05_base",
-        num_gpus=8,
-        chunk_size=30,
-        extra_policy_args={
-            "policy.n_action_steps": "30",
-            "policy.scheduler_decay_steps": "5000",
-        },
-    ),
-    "groot": PolicyBenchmarkConfig(
-        policy_type="groot",
-        policy_path=None,
-        num_gpus=8,
-        chunk_size=30,
-        extra_policy_args={
-            "policy.n_action_steps": "30",
-            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
-            "policy.tune_diffusion_model": "true",
-            "policy.tune_projector": "true",
-            "policy.tune_llm": "false",
-            "policy.tune_visual": "false",
-            "policy.use_bf16": "true",
-        },
-    ),
-    "act": PolicyBenchmarkConfig(
-        policy_type="act",
-        policy_path=None,
-        num_gpus=1,
-        chunk_size=30,
-        extra_policy_args={"policy.n_action_steps": "30"},
-    ),
-    "diffusion": PolicyBenchmarkConfig(
-        policy_type="diffusion",
-        policy_path=None,
-        num_gpus=1,
-        chunk_size=None,
-        extra_policy_args={
-            "policy.horizon": "32",
-            "policy.n_action_steps": "30",
-            "policy.n_obs_steps": "2",
-        },
-    ),
-    "smolvla": PolicyBenchmarkConfig(
-        policy_type="smolvla",
-        policy_path="lerobot/smolvla_base",
-        num_gpus=8,
-        chunk_size=30,
-        extra_policy_args={
-            "policy.n_action_steps": "30",
-            "policy.load_vlm_weights": "true",
-            "policy.freeze_vision_encoder": "false",
-            "policy.train_expert_only": "false",
-            "policy.scheduler_decay_steps": "5000",
-        },
-    ),
-    "xvla": PolicyBenchmarkConfig(
-        policy_type="xvla",
-        policy_path="lerobot/xvla-widowx",
-        num_gpus=4,
-        chunk_size=32,
-        extra_policy_args={
-            "policy.n_action_steps": "32",
-            "policy.scheduler_decay_steps": "5000",
-        },
-    ),
-    "multi_task_dit": PolicyBenchmarkConfig(
-        policy_type="multi_task_dit",
-        policy_path=None,
-        num_gpus=1,
-        chunk_size=None,
-        extra_policy_args={
-            "policy.horizon": "32",
-            "policy.n_action_steps": "30",
-        },
-    ),
-}
-
-# Doubles as the default and the allowed choices of the --policies flag.
-ALL_POLICY_NAMES = list(POLICY_CONFIGS.keys())
-
-# Per-policy size (GB) of the job's `#SBATCH --mem` request.
-# NOTE(review): despite the constant's name, SLURM's --mem sizes host RAM for
-# the job, not GPU memory — confirm these numbers were chosen with that in mind.
-GPU_MEM_ESTIMATES: dict[str, int] = {
-    "pi0": 320,
-    "pi0_fast": 320,
-    "pi05": 280,
-    "groot": 320,
-    "act": 64,
-    "diffusion": 64,
-    "smolvla": 160,
-    "xvla": 160,
-    "multi_task_dit": 64,
-}
-
-
-# ──────────────────────────────────────────────────────────────────────
-# SLURM script generation
-# ──────────────────────────────────────────────────────────────────────
-
-
-def _cli_args(args: dict[str, str]) -> str:
-    """Build a backslash-continued CLI arg string with proper shell quoting.
-
-    Every value goes through :func:`shlex.quote`, so values containing
-    spaces, quotes (including single quotes), ``$``, ``;`` or glob characters
-    reach the target program unchanged. Values made only of shell-safe
-    characters are emitted without quotes.
-
-    Args:
-        args: Mapping of flag name (without the leading ``--``) to value.
-
-    Returns:
-        One ``    --key=value`` entry per item, joined by ``" \\"`` plus a
-        newline so the result can be pasted after a command in a shell script.
-        An empty mapping yields an empty string.
-    """
-    import shlex  # local import keeps this helper self-contained
-
-    return " \\\n".join(f"    --{key}={shlex.quote(str(value))}" for key, value in args.items())
-
-
-def _training_cli_args(
-    policy_name: str,
-    output_dir: Path,
-    hub_org: str,
-    benchmark_uuid: str,
-) -> str:
-    """Assemble the complete training flag string for one policy.
-
-    Layers, later ones overriding earlier ones: the common training flags,
-    the policy type/path/chunk size, the policy's extra flags, and finally
-    the per-run output directory, Hub repo id and W&B names.
-    """
-    spec = POLICY_CONFIGS[policy_name]
-
-    policy_flags: dict[str, str] = {"policy.type": spec.policy_type}
-    if spec.policy_path:
-        policy_flags["policy.path"] = spec.policy_path
-    if spec.chunk_size is not None:
-        policy_flags["policy.chunk_size"] = str(spec.chunk_size)
-
-    run_flags: dict[str, str] = {
-        "output_dir": str(output_dir / "train" / policy_name),
-        "policy.repo_id": f"{hub_org}/{policy_name}_libero",
-        "wandb.project": "lerobot-libero-benchmark",
-        "wandb.run_name": f"{policy_name}_{benchmark_uuid[:8]}",
-    }
-
-    merged: dict[str, str] = {
-        **COMMON_TRAINING_ARGS,
-        **policy_flags,
-        **spec.extra_policy_args,
-        **run_flags,
-    }
-    return _cli_args(merged)
-
-
-def _publish_snippet(
-    policy_name: str,
-    output_dir: Path,
-    hub_org: str,
-    benchmark_uuid: str,
-    hub_dataset: str,
-) -> str:
-    """Inline Python that each SLURM job runs to publish its own result row.
-
-    Returns the text of a ``python3 -c "..."`` shell command with the policy
-    name, paths and benchmark metadata baked in at generation time. When the
-    job runs it, the embedded program reads the timing file, the eval JSON
-    files, the job's ``.out`` logs, the GPU memory CSV and the checkpoint's
-    ``train_config.json``, writes ``benchmark_result.json`` into the policy's
-    training directory, and then tries to append the row to ``hub_dataset``.
-    A failed Hub push is reported but does not raise.
-
-    NOTE(review): the program sits inside a double-quoted shell string and
-    uses single-quoted Python literals, so interpolated values presumably must
-    not contain quotes, ``$`` or backticks — confirm before passing arbitrary
-    paths or repo names.
-    """
-    cfg = POLICY_CONFIGS[policy_name]
-    steps = int(COMMON_TRAINING_ARGS["steps"])
-    bs = int(COMMON_TRAINING_ARGS["batch_size"])
-    eff_bs = bs * cfg.num_gpus
-    train_dir = output_dir / "train" / policy_name
-
-    return textwrap.dedent(f"""\
-        python3 -c "
-        import json, os, re, sys
-        from pathlib import Path
-        from datetime import datetime, timezone
-
-        timing = {{}}
-        tp = Path('{output_dir}/logs/{policy_name}_timing.txt')
-        if tp.exists():
-            for ln in tp.read_text().splitlines():
-                if '=' in ln:
-                    k, _, v = ln.partition('=')
-                    timing[k.strip()] = v.strip()
-
-        # Parse eval results
-        eval_sr, eval_per_task, eval_n = None, '{{}}', 0
-        eval_dir = Path('{train_dir}/eval_results')
-        if eval_dir.exists():
-            for jf in eval_dir.glob('**/*.json'):
-                try:
-                    d = json.loads(jf.read_text())
-                except Exception:
-                    continue
-                if 'avg_success_rate' in d:
-                    eval_sr = d['avg_success_rate']
-                elif 'eval_info' in d and 'avg_success_rate' in d.get('eval_info', {{}}):
-                    eval_sr = d['eval_info']['avg_success_rate']
-                pt = {{k: v for k, v in d.items() if 'success_rate' in k and k != 'avg_success_rate'}}
-                if pt:
-                    eval_per_task = json.dumps(pt)
-                if 'n_episodes' in d:
-                    eval_n = d['n_episodes']
-
-        # Parse final loss from SLURM stdout
-        final_loss = None
-        for lf in sorted(Path('{output_dir}/logs').glob('{policy_name}_*.out'), reverse=True):
-            losses = re.findall(r'\\\"loss\\\"\\s*:\\s*([\\d.e+-]+)', lf.read_text())
-            if losses:
-                final_loss = float(losses[-1])
-                break
-
-        # Parse peak GPU mem
-        peak_mem = 0.0
-        csv_p = Path('{output_dir}/logs/{policy_name}_gpu_mem.csv')
-        if csv_p.exists():
-            for ln in csv_p.read_text().splitlines():
-                parts = ln.strip().split(',')
-                if len(parts) >= 2:
-                    try:
-                        peak_mem = max(peak_mem, float(parts[1].strip()))
-                    except ValueError:
-                        pass
-
-        # Parse train config for optimizer details
-        lr, opt_wd, sched_type, sched_warmup, sched_decay = 0.0, 0.0, '', 0, 0
-        freeze_ve, train_eo, grad_ckpt = False, False, False
-        cfg_path = Path('{train_dir}/checkpoints/{steps:06d}/pretrained_model/train_config.json')
-        if cfg_path.exists():
-            tc = json.loads(cfg_path.read_text())
-            o = tc.get('optimizer', {{}})
-            lr = o.get('lr', 0.0)
-            opt_wd = o.get('weight_decay', 0.0)
-            s = tc.get('scheduler', {{}})
-            sched_type = s.get('type', '')
-            sched_warmup = s.get('num_warmup_steps', 0)
-            sched_decay = s.get('num_decay_steps', 0)
-            p = tc.get('policy', {{}})
-            freeze_ve = p.get('freeze_vision_encoder', False)
-            train_eo = p.get('train_expert_only', False)
-            grad_ckpt = p.get('gradient_checkpointing', False)
-
-        row = {{
-            'benchmark_uuid': '{benchmark_uuid}',
-            'policy_type': '{policy_name}',
-            'policy_repo_id': '{hub_org}/{policy_name}_libero',
-            'base_model_repo_id': '{cfg.policy_path or ""}',
-            'dataset_repo_id': '{COMMON_TRAINING_ARGS["dataset.repo_id"]}',
-            'env_type': '{COMMON_TRAINING_ARGS["env.type"]}',
-            'env_task': '{COMMON_TRAINING_ARGS["env.task"]}',
-            'steps': {steps},
-            'batch_size_per_gpu': {bs},
-            'num_gpus': {cfg.num_gpus},
-            'effective_batch_size': {eff_bs},
-            'total_samples_seen': {steps * eff_bs},
-            'chunk_size': {cfg.chunk_size or 0},
-            'learning_rate': lr,
-            'optimizer_type': 'AdamW',
-            'optimizer_weight_decay': opt_wd,
-            'scheduler_type': sched_type,
-            'scheduler_warmup_steps': sched_warmup,
-            'scheduler_decay_steps': sched_decay,
-            'freeze_vision_encoder': freeze_ve,
-            'train_expert_only': train_eo,
-            'gradient_checkpointing': grad_ckpt,
-            'eval_success_rate': eval_sr,
-            'eval_success_rate_per_task': eval_per_task,
-            'eval_n_episodes': eval_n,
-            'final_train_loss': final_loss,
-            'training_time_s': float(timing.get('TRAINING_TIME_S', 0)),
-            'peak_gpu_memory_mb': peak_mem or float(timing.get('MAX_GPU_MEM_MB', 0)),
-            'gpu_type': timing.get('GPU_TYPE', 'unknown'),
-            'lerobot_commit': timing.get('LEROBOT_COMMIT', 'unknown'),
-            'timestamp': datetime.now(timezone.utc).isoformat(),
-        }}
-
-        # Save locally
-        Path('{train_dir}/benchmark_result.json').write_text(json.dumps(row, indent=2, default=str))
-
-        # Push to HF dataset
-        try:
-            from datasets import Dataset, load_dataset
-            try:
-                existing = load_dataset('{hub_dataset}', split='train')
-                rows = existing.to_list() + [row]
-            except Exception:
-                rows = [row]
-            Dataset.from_list(rows).push_to_hub('{hub_dataset}', split='train')
-            print('Published result to {hub_dataset}')
-        except ImportError:
-            print('datasets library not installed — result saved locally only')
-        except Exception as e:
-            print(f'Failed to push to hub: {{e}} — result saved locally')
-        "
-    """)
-
-
-def _generate_sbatch_script(
-    policy_name: str,
-    output_dir: Path,
-    hub_org: str,
-    benchmark_uuid: str,
-    hub_dataset: str,
-    lerobot_commit: str,
-) -> str:
-    """Render the sbatch script that trains, evaluates and publishes one policy.
-
-    Args:
-        policy_name: Key into ``POLICY_CONFIGS``.
-        output_dir: Benchmark root; logs go to ``logs/`` and training output
-            to ``train/<policy_name>/`` underneath it.
-        hub_org: Hub organisation used for the pushed policy repo.
-        benchmark_uuid: Identifier shared by every job of this benchmark run.
-        hub_dataset: Hub dataset repo that receives the result row.
-        lerobot_commit: Commit hash recorded in the timing file.
-
-    Returns:
-        The script text, starting with ``#!/bin/bash`` in column 0.
-
-    The generated script records the training and evaluation exit codes
-    instead of letting ``set -e`` abort on them, so the timing file and the
-    publish step still run after a failure; the job then exits with the
-    failing code.
-    """
-    cfg = POLICY_CONFIGS[policy_name]
-    steps = int(COMMON_TRAINING_ARGS["steps"])
-    log_dir = output_dir / "logs"
-    train_dir = output_dir / "train" / policy_name
-    checkpoint_path = train_dir / f"checkpoints/{steps:06d}/pretrained_model"
-
-    def _embed(block: str, indent: int) -> str:
-        # Indent every line after the first to the template column the block is
-        # interpolated at. Without this the continuation lines sit left of the
-        # template, textwrap.dedent finds no common margin, and the shebang,
-        # #SBATCH directives and heredoc terminator all stay indented.
-        return block.rstrip("\n").replace("\n", "\n" + " " * indent)
-
-    training_args = _embed(_training_cli_args(policy_name, output_dir, hub_org, benchmark_uuid), 8)
-    eval_args = _embed(_cli_args(EVAL_ARGS), 12)
-    publish = _embed(_publish_snippet(policy_name, output_dir, hub_org, benchmark_uuid, hub_dataset), 8)
-
-    return textwrap.dedent(f"""\
-        #!/bin/bash
-        #SBATCH --job-name=bench_{policy_name}
-        #SBATCH --nodes=1
-        #SBATCH --ntasks-per-node=1
-        #SBATCH --gres=gpu:{cfg.num_gpus}
-        #SBATCH --cpus-per-task={cfg.num_gpus * 8}
-        #SBATCH --mem={GPU_MEM_ESTIMATES.get(policy_name, 128)}G
-        #SBATCH --time=06:00:00
-        #SBATCH --output={log_dir}/{policy_name}_%j.out
-        #SBATCH --error={log_dir}/{policy_name}_%j.err
-
-        set -euo pipefail
-
-        echo "=========================================="
-        echo "LeRobot LIBERO Benchmark — {policy_name}"
-        echo "UUID: {benchmark_uuid}"
-        echo "Start: $(date -Iseconds)"
-        echo "Host: $(hostname) | GPUs: {cfg.num_gpus}"
-        echo "=========================================="
-
-        START_TIME=$(date +%s)
-
-        # GPU memory monitoring (every 30s)
-        nvidia-smi --query-gpu=index,memory.used,memory.total,gpu_name \\
-            --format=csv,noheader,nounits -l 30 \\
-            > "{log_dir}/{policy_name}_gpu_mem.csv" &
-        GPU_MONITOR_PID=$!
-
-        # ── Training ──────────────────────────────────────────────────
-        echo "[$(date -Iseconds)] Starting training..."
-        # Capture the exit code with "||" so "set -e" does not abort the job
-        # before the timing file and the result row are written.
-        TRAIN_EXIT=0
-        accelerate launch --num_processes={cfg.num_gpus} \\
-            $(which lerobot-train) \\
-        {training_args} || TRAIN_EXIT=$?
-        TRAIN_END=$(date +%s)
-        echo "[$(date -Iseconds)] Training exit code: $TRAIN_EXIT"
-
-        # ── Evaluation ────────────────────────────────────────────────
-        EVAL_EXIT=1
-        if [ "$TRAIN_EXIT" -eq 0 ]; then
-            echo "[$(date -Iseconds)] Starting evaluation..."
-            EVAL_EXIT=0
-            lerobot-eval \\
-                --policy.path="{checkpoint_path}" \\
-            {eval_args} \\
-                --output_dir="{train_dir}/eval_results" || EVAL_EXIT=$?
-            echo "[$(date -Iseconds)] Eval exit code: $EVAL_EXIT"
-        else
-            echo "[$(date -Iseconds)] Skipping eval — training failed."
-        fi
-
-        # ── Timing ────────────────────────────────────────────────────
-        END_TIME=$(date +%s)
-        kill $GPU_MONITOR_PID 2>/dev/null || true
-
-        cat > "{log_dir}/{policy_name}_timing.txt" <<TIMING_EOF
-        BENCHMARK_UUID={benchmark_uuid}
-        POLICY_TYPE={policy_name}
-        TRAINING_TIME_S=$((TRAIN_END - START_TIME))
-        TOTAL_TIME_S=$((END_TIME - START_TIME))
-        TRAIN_EXIT=$TRAIN_EXIT
-        EVAL_EXIT=$EVAL_EXIT
-        MAX_GPU_MEM_MB=$(awk -F',' '{{print $2}}' "{log_dir}/{policy_name}_gpu_mem.csv" 2>/dev/null | sort -n | tail -1)
-        GPU_TYPE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | head -1 | xargs)
-        LEROBOT_COMMIT={lerobot_commit}
-        TIMING_EOF
-
-        # ── Publish result to HF dataset ──────────────────────────────
-        echo "[$(date -Iseconds)] Publishing result..."
-        {publish}
-
-        echo "=========================================="
-        echo "Done: $(date -Iseconds)"
-        echo "Training: $((TRAIN_END - START_TIME))s | Total: $((END_TIME - START_TIME))s"
-        echo "=========================================="
-
-        # Report failure to SLURM instead of finishing as COMPLETED.
-        if [ "$TRAIN_EXIT" -ne 0 ]; then exit "$TRAIN_EXIT"; fi
-        exit "$EVAL_EXIT"
-    """)
-
-
-def _generate_tokenizer_script(
-    output_dir: Path,
-    hub_org: str,
-    benchmark_uuid: str,
-) -> str:
-    """Render the sbatch script that trains the tokenizer for ``pi0_fast``.
-
-    The tokenizer flags come from ``POLICY_CONFIGS["pi0_fast"].tokenizer_args``
-    plus a ``hub_repo_id`` of ``<hub_org>/fast-tokenizer-libero``.
-
-    Returns:
-        The script text, starting with ``#!/bin/bash`` in column 0.
-    """
-    cfg = POLICY_CONFIGS["pi0_fast"]
-    log_dir = output_dir / "logs"
-    tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
-
-    tok_args = dict(cfg.tokenizer_args)
-    tok_args["hub_repo_id"] = tokenizer_hub_repo
-
-    # Indent continuation lines to the template column. Otherwise they sit left
-    # of the template, textwrap.dedent strips only part of the margin, and the
-    # shebang and #SBATCH lines are left indented.
-    tok_cli = _cli_args(tok_args).replace("\n", "\n" + " " * 8)
-
-    return textwrap.dedent(f"""\
-        #!/bin/bash
-        #SBATCH --job-name=bench_tokenizer
-        #SBATCH --nodes=1
-        #SBATCH --ntasks-per-node=1
-        #SBATCH --gres=gpu:1
-        #SBATCH --cpus-per-task=8
-        #SBATCH --mem=64G
-        #SBATCH --time=01:00:00
-        #SBATCH --output={log_dir}/tokenizer_%j.out
-        #SBATCH --error={log_dir}/tokenizer_%j.err
-
-        set -euo pipefail
-        echo "LeRobot — FAST Tokenizer | UUID: {benchmark_uuid}"
-
-        lerobot-train-tokenizer \\
-        {tok_cli}
-
-        echo "Tokenizer pushed to: {tokenizer_hub_repo}"
-    """)
-
-
-# ──────────────────────────────────────────────────────────────────────
-# Main
-# ──────────────────────────────────────────────────────────────────────
-
-
-def main() -> None:
-    """CLI entry point: write the sbatch scripts and a manifest, then print usage.
-
-    Under ``--output_dir`` this creates ``slurm_scripts/``, ``logs/`` and one
-    ``train/<policy>/`` directory per selected policy, writes a tokenizer
-    script when ``pi0_fast`` is selected, writes one numbered script per
-    policy, and records everything in ``benchmark_manifest.json``.
-    """
-    parser = argparse.ArgumentParser(description="Generate SLURM scripts for LeRobot LIBERO benchmark.")
-    parser.add_argument(
-        "--policies",
-        nargs="+",
-        default=ALL_POLICY_NAMES,
-        choices=ALL_POLICY_NAMES,
-        help="Policies to benchmark (default: all).",
-    )
-    parser.add_argument("--output_dir", type=Path, required=True, help="Root output directory.")
-    parser.add_argument("--hub_org", type=str, default="lerobot", help="HuggingFace org.")
-    parser.add_argument("--hub_dataset", type=str, default=None, help="HF dataset repo for results.")
-    parser.add_argument("--uuid", type=str, default=None, help="Override benchmark UUID.")
-    args = parser.parse_args()
-
-    benchmark_uuid = args.uuid or str(uuid.uuid4())
-    output_dir: Path = args.output_dir.resolve()
-    # Drop repeated names (e.g. ``--policies act act``) but keep the given
-    # order, so no policy ends up with two numbered scripts.
-    policies: list[str] = list(dict.fromkeys(args.policies))
-    hub_org: str = args.hub_org
-    hub_dataset: str = args.hub_dataset or f"{hub_org}/benchmark-libero"
-
-    try:
-        commit = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        # Not a git checkout, or git is not installed.
-        commit = "unknown"
-
-    scripts_dir = output_dir / "slurm_scripts"
-    log_dir = output_dir / "logs"
-    scripts_dir.mkdir(parents=True, exist_ok=True)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    for p in policies:
-        (output_dir / "train" / p).mkdir(parents=True, exist_ok=True)
-
-    generated: dict[str, Path] = {}
-
-    # Tokenizer job for pi0_fast
-    tokenizer_path = None
-    if "pi0_fast" in policies:
-        script = _generate_tokenizer_script(output_dir, hub_org, benchmark_uuid)
-        tokenizer_path = scripts_dir / "00_tokenizer.sh"
-        # The scripts contain non-ASCII characters, so pin the encoding rather
-        # than depend on the locale.
-        tokenizer_path.write_text(script, encoding="utf-8")
-        tokenizer_path.chmod(0o755)
-        generated["tokenizer"] = tokenizer_path
-        tokenizer_hub_repo = f"{hub_org}/fast-tokenizer-libero"
-        # Must happen before the per-policy loop so the pi0_fast training
-        # script picks up the tokenizer repo.
-        POLICY_CONFIGS["pi0_fast"].extra_policy_args["policy.action_tokenizer_name"] = tokenizer_hub_repo
-
-    # Per-policy scripts
-    for i, name in enumerate(sorted(policies), start=1):
-        script = _generate_sbatch_script(name, output_dir, hub_org, benchmark_uuid, hub_dataset, commit)
-        path = scripts_dir / f"{i:02d}_{name}.sh"
-        path.write_text(script, encoding="utf-8")
-        path.chmod(0o755)
-        generated[name] = path
-
-    # Manifest
-    manifest = {
-        "benchmark_uuid": benchmark_uuid,
-        "timestamp": datetime.now(UTC).isoformat(),
-        "lerobot_commit": commit,
-        "hub_org": hub_org,
-        "hub_dataset": hub_dataset,
-        "policies": policies,
-        "output_dir": str(output_dir),
-        "scripts": {k: str(v) for k, v in generated.items()},
-    }
-    manifest_path = output_dir / "benchmark_manifest.json"
-    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
-
-    # Instructions
-    print("=" * 60)
-    print("LeRobot LIBERO Benchmark — Scripts Generated")
-    print(f"UUID: {benchmark_uuid}")
-    print(f"Output: {output_dir}")
-    print(f"Results dataset: {hub_dataset}")
-    print("=" * 60)
-    print()
-    for _name, path in sorted(generated.items()):
-        print(f"  {path}")
-    print()
-
-    if tokenizer_path:
-        print("IMPORTANT: pi0_fast requires tokenizer training FIRST.")
-        print(f"  1. sbatch {tokenizer_path}")
-        print("  2. Wait for completion")
-        print(f"  3. sbatch {generated.get('pi0_fast', 'N/A')}")
-        print("  4. All other policies can run in parallel")
-    else:
-        print("All scripts can be submitted in parallel.")
-    print()
-    print("Each job publishes its result to the HF dataset automatically.")
-
-if __name__ == "__main__":
-    main()
@@ -1,156 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
-
-from __future__ import annotations
-
-import argparse
-import json
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any
-
-from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
-
-
-def load_json_if_exists(path: Path | None) -> dict[str, Any] | None:
-    """Parse and return the JSON file at ``path``, or ``None`` if there is none.
-
-    ``path`` may itself be ``None``: ``build_row`` passes the result of
-    ``find_latest_train_config_path``, which is ``None`` when no checkpoint
-    exists. Malformed JSON still raises ``json.JSONDecodeError``.
-    """
-    if path is None or not path.exists():
-        return None
-    return json.loads(path.read_text())
-
-
-def find_latest_train_config_path(run_root: Path) -> Path | None:
-    """Return the ``train_config.json`` of the last checkpoint under ``run_root``.
-
-    "Last" means the checkpoint directory whose name sorts highest as a
-    string. Returns ``None`` when ``train/checkpoints`` is missing or holds no
-    ``*/pretrained_model/train_config.json``.
-    """
-    ckpt_root = run_root / "train" / "checkpoints"
-    if not ckpt_root.exists():
-        return None
-    configs = ckpt_root.glob("*/pretrained_model/train_config.json")
-    # <checkpoint>/pretrained_model/train_config.json -> compare on <checkpoint>.
-    return max(configs, key=lambda cfg: cfg.parent.parent.name, default=None)
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse the publisher's command-line flags from ``sys.argv``.
-
-    Every flag is required except ``--slurm_job_id``, which defaults to "".
-    """
-    # (flag, add_argument keyword arguments), in the order shown by --help.
-    options: list[tuple[str, dict[str, Any]]] = [
-        ("--benchmark", {"required": True}),
-        ("--policy", {"required": True}),
-        ("--run_root", {"required": True, "type": Path}),
-        ("--results_repo", {"required": True}),
-        ("--git_commit", {"required": True}),
-        ("--num_gpus", {"required": True, "type": int}),
-        ("--microbatch_per_gpu", {"required": True, "type": int}),
-        ("--gradient_accumulation_steps", {"required": True, "type": int}),
-        ("--effective_batch_size", {"required": True, "type": int}),
-        ("--train_wall_time_s", {"required": True, "type": float}),
-        ("--eval_wall_time_s", {"required": True, "type": float}),
-        ("--slurm_job_id", {"default": ""}),
-        ("--docker_image", {"required": True}),
-    ]
-
-    parser = argparse.ArgumentParser(description=__doc__)
-    for flag, kwargs in options:
-        parser.add_argument(flag, **kwargs)
-    return parser.parse_args()
-
-
-def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
-    """Build the benchmark row, write it to disk, and list the files to upload.
-
-    Args:
-        args: Namespace produced by ``parse_args``.
-
-    Returns:
-        ``(row, upload_list)``. ``row`` is the JSON-serialisable record, also
-        written to ``<run_root>/benchmark_row.json``. ``upload_list`` always
-        contains that file and additionally ``eval_info.json`` and the latest
-        ``train_config.json`` when they exist.
-
-    A run without any checkpoint is valid: ``train_config`` is then ``{}`` and
-    no train-config artifact is listed.
-    """
-    now = datetime.now(UTC)
-    created_at = now.isoformat()
-    timestamp = utc_timestamp_slug(now)
-    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
-
-    eval_info_path = args.run_root / "eval" / "eval_info.json"
-    eval_info = load_json_if_exists(eval_info_path) or {}
-
-    # find_latest_train_config_path returns None when there is no checkpoint;
-    # do not hand that to load_json_if_exists, which calls .exists() on it.
-    train_config_path = find_latest_train_config_path(args.run_root)
-    train_config: dict[str, Any] = {}
-    if train_config_path is not None:
-        train_config = load_json_if_exists(train_config_path) or {}
-
-    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
-    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"
-
-    row = {
-        "schema_version": 1,
-        "created_at": created_at,
-        "run_id": run_id,
-        "benchmark": args.benchmark,
-        "policy": args.policy,
-        "git_commit": args.git_commit,
-        "slurm_job_id": args.slurm_job_id or None,
-        "docker_image": args.docker_image,
-        "resources": {
-            "num_gpus": args.num_gpus,
-            "microbatch_per_gpu": args.microbatch_per_gpu,
-            "gradient_accumulation_steps": args.gradient_accumulation_steps,
-            "effective_batch_size": args.effective_batch_size,
-        },
-        "timings": {
-            "train_wall_time_s": args.train_wall_time_s,
-            "eval_wall_time_s": args.eval_wall_time_s,
-            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
-        },
-        "eval": {
-            "overall": eval_info.get("overall", {}),
-            "per_group": eval_info.get("per_group", {}),
-            "per_task_count": len(eval_info.get("per_task", [])),
-        },
-        "paths": {
-            "run_root": str(args.run_root),
-            "train_dir": str(args.run_root / "train"),
-            "eval_dir": str(args.run_root / "eval"),
-        },
-        "train_config": train_config,
-        "artifact_urls": {
-            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
-        },
-    }
-
-    row_path = args.run_root / "benchmark_row.json"
-    row_path.parent.mkdir(parents=True, exist_ok=True)
-    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]
-
-    if eval_info_path.exists():
-        row["artifact_urls"]["eval_info"] = make_hub_file_url(
-            args.results_repo, f"{artifact_prefix}/eval_info.json"
-        )
-        upload_list.append(
-            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
-        )
-
-    if train_config_path is not None and train_config_path.exists():
-        row["artifact_urls"]["train_config"] = make_hub_file_url(
-            args.results_repo, f"{artifact_prefix}/train_config.json"
-        )
-        upload_list.append(
-            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
-        )
-
-    # Written last so the file contains every artifact URL added above.
-    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
-    return row, upload_list
-
-
-def main() -> int:
-    """Build the benchmark row, upload it with its artifacts, and print it.
-
-    The row file is rewritten after the upload so the local copy also records
-    ``uploaded_paths``. Returns 0, which the caller uses as the exit status.
-    """
-    cli = parse_args()
-    record, targets = build_row(cli)
-
-    record["uploaded_paths"] = upload_targets(
-        repo_id=cli.results_repo,
-        targets=targets,
-        repo_type="dataset",
-        private=False,
-        commit_message=f"Add benchmark row {record['run_id']}",
-    )
-
-    serialized = json.dumps(record, indent=2, sort_keys=True)
-    (cli.run_root / "benchmark_row.json").write_text(serialized)
-    print(serialized)
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
@@ -1,647 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
-
-from __future__ import annotations
-
-import argparse
-import json
-import math
-import subprocess
-from dataclasses import asdict, dataclass, field
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any
-
-from lerobot.utils.history_repo import utc_timestamp_slug
-
-MAX_GPUS = 8
-MIN_GPUS = 1
-DEFAULT_STEPS = 20_000
-DEFAULT_EFFECTIVE_BATCH_SIZE = 256
-DEFAULT_MICROBATCH_PER_GPU = 32
-DEFAULT_EVAL_BATCH_SIZE = 1
-DEFAULT_CPUS_PER_GPU = 8
-DEFAULT_MEMORY_PER_GPU_GB = 40
-
-
-@dataclass(frozen=True)
-class BenchmarkSpec:
-    name: str
-    dataset_repo_id: str
-    docker_image: str
-    eval_env_type: str
-    eval_task: str
-    eval_n_episodes: int
-    train_steps: int = DEFAULT_STEPS
-    effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
-    train_extra_args: dict[str, Any] = field(default_factory=dict)
-    eval_extra_args: dict[str, Any] = field(default_factory=dict)
-
-
-@dataclass(frozen=True)
-class PolicySpec:
-    name: str
-    policy_type: str
-    num_gpus: int
-    policy_path: str | None = None
-    microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
-    extra_train_args: dict[str, Any] = field(default_factory=dict)
-    extra_eval_args: dict[str, Any] = field(default_factory=dict)
-    needs_tokenizer: bool = False
-    tokenizer_args: dict[str, Any] = field(default_factory=dict)
-
-
-@dataclass(frozen=True)
-class PlannedJob:
-    benchmark: str
-    policy: str
-    run_rel: str
-    num_gpus: int
-    microbatch_per_gpu: int
-    gradient_accumulation_steps: int
-    effective_batch_size: int
-    docker_image: str
-    train_args: dict[str, Any]
-    eval_args: dict[str, Any]
-    tokenizer_args: dict[str, Any] | None
-    script_path: str
-
-
-BENCHMARKS: dict[str, BenchmarkSpec] = {
-    "libero_plus": BenchmarkSpec(
-        name="libero_plus",
-        dataset_repo_id="lerobot/libero_plus",
-        docker_image="lerobot-benchmark-libero-plus:latest",
-        eval_env_type="libero_plus",
-        eval_task="libero_spatial,libero_object,libero_goal,libero_10",
-        eval_n_episodes=10,
-        train_extra_args={
-            "rename_map": {
-                "observation.images.image": "observation.images.camera1",
-                "observation.images.image2": "observation.images.camera2",
-            },
-        },
-        eval_extra_args={
-            "env.camera_name_mapping": {
-                "agentview_image": "camera1",
-                "robot0_eye_in_hand_image": "camera2",
-            },
-            "env.max_parallel_tasks": 1,
-            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
-            "eval.use_async_envs": False,
-            "eval.max_episodes_rendered": 0,
-            "policy.device": "cuda",
-        },
-    ),
-    "robomme": BenchmarkSpec(
-        name="robomme",
-        dataset_repo_id="lerobot/robomme",
-        docker_image="lerobot-benchmark-robomme:latest",
-        eval_env_type="robomme",
-        eval_task=(
-            "BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
-            "ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
-            "VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
-        ),
-        eval_n_episodes=50,
-        train_extra_args={
-            "rename_map": {
-                "observation.images.image": "observation.images.camera1",
-                "observation.images.wrist_image": "observation.images.camera2",
-            },
-        },
-        eval_extra_args={
-            "env.dataset_split": "test",
-            "env.max_parallel_tasks": 1,
-            "rename_map": {
-                "observation.images.image": "observation.images.camera1",
-                "observation.images.wrist_image": "observation.images.camera2",
-            },
-            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
-            "eval.use_async_envs": False,
-            "eval.max_episodes_rendered": 0,
-            "policy.device": "cuda",
-        },
-    ),
-}
-
-
-POLICIES: dict[str, PolicySpec] = {
-    "pi0": PolicySpec(
-        name="pi0",
-        policy_type="pi0",
-        policy_path="lerobot/pi0_base",
-        num_gpus=8,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-            "policy.scheduler_decay_steps": DEFAULT_STEPS,
-            "policy.empty_cameras": 0,
-        },
-    ),
-    "pi0_fast": PolicySpec(
-        name="pi0_fast",
-        policy_type="pi0_fast",
-        policy_path="lerobot/pi0fast-base",
-        num_gpus=8,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-            "policy.scheduler_decay_steps": DEFAULT_STEPS,
-            "policy.empty_cameras": 0,
-        },
-        needs_tokenizer=True,
-        tokenizer_args={
-            "action_horizon": 30,
-            "encoded_dims": "0:7",
-            "normalization_mode": "QUANTILES",
-            "vocab_size": 1024,
-            "scale": 10.0,
-            "push_to_hub": True,
-        },
-    ),
-    "pi05": PolicySpec(
-        name="pi05",
-        policy_type="pi05",
-        policy_path="lerobot/pi05_base",
-        num_gpus=8,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-            "policy.scheduler_decay_steps": DEFAULT_STEPS,
-            "policy.empty_cameras": 0,
-        },
-    ),
-    "groot": PolicySpec(
-        name="groot",
-        policy_type="groot",
-        num_gpus=8,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
-            "policy.tune_diffusion_model": True,
-            "policy.tune_projector": True,
-            "policy.tune_llm": False,
-            "policy.tune_visual": False,
-            "policy.use_bf16": True,
-        },
-    ),
-    "act": PolicySpec(
-        name="act",
-        policy_type="act",
-        num_gpus=1,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-        },
-    ),
-    "diffusion": PolicySpec(
-        name="diffusion",
-        policy_type="diffusion",
-        num_gpus=1,
-        extra_train_args={
-            "policy.horizon": 32,
-            "policy.n_action_steps": 30,
-            "policy.n_obs_steps": 2,
-        },
-    ),
-    "smolvla": PolicySpec(
-        name="smolvla",
-        policy_type="smolvla",
-        policy_path="lerobot/smolvla_base",
-        num_gpus=8,
-        extra_train_args={
-            "policy.n_action_steps": 30,
-            "policy.load_vlm_weights": True,
-            "policy.freeze_vision_encoder": False,
-            "policy.train_expert_only": False,
-            "policy.scheduler_decay_steps": DEFAULT_STEPS,
-            "policy.empty_cameras": 1,
-        },
-    ),
-    "xvla": PolicySpec(
-        name="xvla",
-        policy_type="xvla",
-        policy_path="lerobot/xvla-widowx",
-        num_gpus=4,
-        extra_train_args={
-            "policy.n_action_steps": 32,
-            "policy.scheduler_decay_steps": DEFAULT_STEPS,
-            "policy.empty_cameras": 1,
-        },
-    ),
-    "multi_task_dit": PolicySpec(
-        name="multi_task_dit",
-        policy_type="multi_task_dit",
-        num_gpus=1,
-        extra_train_args={
-            "policy.horizon": 32,
-            "policy.n_action_steps": 30,
-        },
-    ),
-}
-
-
-def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
-    return repo_or_id if "/" in repo_or_id else f"{hub_org}/{repo_or_id}"
-
-
-def get_requested_names(
-    requested: list[str] | None,
-    available: dict[str, Any],
-    *,
-    kind: str,
-) -> list[str]:
-    if not requested:
-        return list(available)
-    unknown = sorted(set(requested) - set(available))
-    if unknown:
-        raise ValueError(f"Unknown {kind}: {', '.join(unknown)}. Available: {', '.join(available)}")
-    return requested
-
-
-def compute_gradient_accumulation_steps(
-    *,
-    effective_batch_size: int,
-    num_gpus: int,
-    microbatch_per_gpu: int,
-) -> int:
-    per_step_batch = num_gpus * microbatch_per_gpu
-    if effective_batch_size % per_step_batch != 0:
-        raise ValueError(
-            f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
-            f"{microbatch_per_gpu=}."
-        )
-    return effective_batch_size // per_step_batch
-
-
-def make_run_slug() -> str:
-    return utc_timestamp_slug()
-
-
-def shell_value(value: Any) -> str:
-    if isinstance(value, bool):
-        value = "true" if value else "false"
-    elif isinstance(value, (dict, list)):
-        value = json.dumps(value, sort_keys=True)
-    else:
-        value = str(value)
-    escaped = (
-        value.replace("\\", "\\\\")
-        .replace('"', '\\"')
-        .replace("$", "\\$")
-        .replace("`", "\\`")
-    )
-    return f'"{escaped}"'
-
-
-def format_cli_args(args: dict[str, Any]) -> str:
-    lines = []
-    for key, value in args.items():
-        lines.append(f"  --{key}={shell_value(value)}")
-    return " \\\n".join(lines)
-
-
-def build_train_args(
-    *,
-    benchmark: BenchmarkSpec,
-    policy: PolicySpec,
-    train_dir: str,
-    gradient_accumulation_steps: int,
-) -> dict[str, Any]:
-    args: dict[str, Any] = {
-        "dataset.repo_id": benchmark.dataset_repo_id,
-        "output_dir": train_dir,
-        "steps": benchmark.train_steps,
-        "batch_size": policy.microbatch_per_gpu,
-        "gradient_accumulation_steps": gradient_accumulation_steps,
-        "eval_freq": 0,
-        "save_freq": benchmark.train_steps,
-        "save_checkpoint": True,
-        "log_freq": 100,
-        "wandb.enable": False,
-        "policy.push_to_hub": False,
-        "policy.device": "cuda",
-    }
-    if policy.policy_path:
-        args["policy.path"] = policy.policy_path
-    else:
-        args["policy.type"] = policy.policy_type
-    args.update(benchmark.train_extra_args)
-    args.update(policy.extra_train_args)
-    return args
-
-
-def build_eval_args(
-    *,
-    benchmark: BenchmarkSpec,
-    policy: PolicySpec,
-    checkpoint_path: str,
-    eval_dir: str,
-) -> dict[str, Any]:
-    args: dict[str, Any] = {
-        "policy.path": checkpoint_path,
-        "env.type": benchmark.eval_env_type,
-        "env.task": benchmark.eval_task,
-        "eval.n_episodes": benchmark.eval_n_episodes,
-        "output_dir": eval_dir,
-    }
-    args.update(benchmark.eval_extra_args)
-    args.update(policy.extra_eval_args)
-    return args
-
-
-def plan_jobs(
-    *,
-    output_dir: Path,
-    hub_org: str,
-    results_repo: str,
-    policies: list[str],
-    benchmarks: list[str],
-) -> list[PlannedJob]:
-    _ = hub_org
-    _ = results_repo
-    scripts_dir = output_dir / "slurm"
-    jobs: list[PlannedJob] = []
-    for benchmark_name in benchmarks:
-        benchmark = BENCHMARKS[benchmark_name]
-        for policy_name in policies:
-            policy = POLICIES[policy_name]
-            num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
-            run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
-            run_root = f"/benchmark-output/{run_rel}"
-            gradient_accumulation_steps = compute_gradient_accumulation_steps(
-                effective_batch_size=benchmark.effective_batch_size,
-                num_gpus=num_gpus,
-                microbatch_per_gpu=policy.microbatch_per_gpu,
-            )
-            train_dir = f"{run_root}/train"
-            checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
-            eval_dir = f"{run_root}/eval"
-            train_args = build_train_args(
-                benchmark=benchmark,
-                policy=policy,
-                train_dir=train_dir,
-                gradient_accumulation_steps=gradient_accumulation_steps,
-            )
-            eval_args = build_eval_args(
-                benchmark=benchmark,
-                policy=policy,
-                checkpoint_path=checkpoint_path,
-                eval_dir=eval_dir,
-            )
-            tokenizer_args = None
-            if policy.needs_tokenizer:
-                tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
-                tokenizer_args = {
-                    "repo_id": benchmark.dataset_repo_id,
-                    "output_dir": f"{run_root}/tokenizer",
-                    "hub_repo_id": tokenizer_repo_id,
-                    **policy.tokenizer_args,
-                }
-                train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
-            script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
-            jobs.append(
-                PlannedJob(
-                    benchmark=benchmark_name,
-                    policy=policy_name,
-                    run_rel=run_rel,
-                    num_gpus=num_gpus,
-                    microbatch_per_gpu=policy.microbatch_per_gpu,
-                    gradient_accumulation_steps=gradient_accumulation_steps,
-                    effective_batch_size=benchmark.effective_batch_size,
-                    docker_image=benchmark.docker_image,
-                    train_args=train_args,
-                    eval_args=eval_args,
-                    tokenizer_args=tokenizer_args,
-                    script_path=script_path,
-                )
-            )
-    return jobs
-
-
-def render_sbatch_script(
-    *,
-    job: PlannedJob,
-    output_dir: Path,
-    results_repo_id: str,
-    git_commit: str,
-) -> str:
-    host_output_dir = output_dir.resolve()
-    run_root = f"/benchmark-output/{job.run_rel}"
-    host_run_root = host_output_dir / job.run_rel
-    cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
-    mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
-    gpu_ids_expr = "${GPU_IDS}"
-    train_cli = format_cli_args(job.train_args)
-    eval_cli = format_cli_args(job.eval_args)
-    tokenizer_command = ""
-    if job.tokenizer_args:
-        tokenizer_cli = format_cli_args(job.tokenizer_args)
-        tokenizer_command = f"""
-docker run --rm --gpus all \\
-  --shm-size=16g \\
-  -e CUDA_VISIBLE_DEVICES={gpu_ids_expr} \\
-  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_HOME=/tmp/hf \\
-  -v "{host_output_dir}:/benchmark-output" \\
-  -w /lerobot \\
-  "{job.docker_image}" \\
-  bash -lc '
-    set -euo pipefail
-    if [[ -n "${{HF_TOKEN:-}}" ]]; then
-      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
-    fi
-    lerobot-train-tokenizer \\
-{tokenizer_cli}
-  '
-"""
-    return f"""#!/bin/bash
-#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
-#SBATCH --gres=gpu:{job.num_gpus}
-#SBATCH --cpus-per-task={cpus_per_task}
-#SBATCH --mem={mem_gb}G
-#SBATCH --output={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.out
-#SBATCH --error={output_dir.resolve()}/logs/{job.benchmark}__{job.policy}__%j.err
-
-set -euo pipefail
-
-HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
-GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
-RUN_ROOT="{run_root}"
-
-mkdir -p "{host_output_dir}/logs"
-mkdir -p "{host_run_root.parent}"
-
-{tokenizer_command}
-
-TRAIN_START="$(date +%s)"
-docker run --rm --gpus all \\
-  --shm-size=16g \\
-  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_HOME=/tmp/hf \\
-  -v "{host_output_dir}:/benchmark-output" \\
-  -w /lerobot \\
-  "{job.docker_image}" \\
-  bash -lc '
-    set -euo pipefail
-    if [[ -n "${{HF_TOKEN:-}}" ]]; then
-      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
-    fi
-    accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
-{train_cli}
-  '
-TRAIN_END="$(date +%s)"
-
-EVAL_START="$(date +%s)"
-docker run --rm --gpus all \\
-  --shm-size=16g \\
-  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_HOME=/tmp/hf \\
-  -v "{host_output_dir}:/benchmark-output" \\
-  -w /lerobot \\
-  "{job.docker_image}" \\
-  bash -lc '
-    set -euo pipefail
-    if [[ -n "${{HF_TOKEN:-}}" ]]; then
-      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
-    fi
-    lerobot-eval \\
-{eval_cli}
-  '
-EVAL_END="$(date +%s)"
-TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
-EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
-
-docker run --rm --gpus all \\
-  --shm-size=16g \\
-  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
-  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
-  -e HF_HOME=/tmp/hf \\
-  -e RUN_ROOT="${{RUN_ROOT}}" \\
-  -e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
-  -e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
-  -v "{host_output_dir}:/benchmark-output" \\
-  -w /lerobot \\
-  "{job.docker_image}" \\
-  bash -lc '
-    set -euo pipefail
-    if [[ -n "${{HF_TOKEN:-}}" ]]; then
-      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
-    fi
-    uv run python benchmarks/publish_benchmark_result.py \\
-      --benchmark={job.benchmark} \\
-      --policy={job.policy} \\
-      --run_root="${{RUN_ROOT}}" \\
-      --results_repo={results_repo_id} \\
-      --git_commit={git_commit} \\
-      --num_gpus={job.num_gpus} \\
-      --microbatch_per_gpu={job.microbatch_per_gpu} \\
-      --gradient_accumulation_steps={job.gradient_accumulation_steps} \\
-      --effective_batch_size={job.effective_batch_size} \\
-      --train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
-      --eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
-      --slurm_job_id="${{SLURM_JOB_ID:-}}" \\
-      --docker_image={job.docker_image}
-  '
-"""
-
-
-def write_manifest(
-    *,
-    output_dir: Path,
-    jobs: list[PlannedJob],
-    git_commit: str,
-    hub_org: str,
-    results_repo: str,
-) -> Path:
-    manifest = {
-        "generated_at": datetime.now(UTC).isoformat(),
-        "git_commit": git_commit,
-        "hub_org": hub_org,
-        "results_repo": results_repo,
-        "jobs": [asdict(job) for job in jobs],
-    }
-    manifest_path = output_dir / "manifest.json"
-    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
-    return manifest_path
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--policies", nargs="*", default=None)
-    parser.add_argument("--benchmarks", nargs="*", default=None)
-    parser.add_argument("--output_dir", required=True, type=Path)
-    parser.add_argument("--hub_org", required=True)
-    parser.add_argument("--results_repo", required=True)
-    parser.add_argument("--submit", action="store_true")
-    return parser.parse_args()
-
-
-def get_git_commit() -> str:
-    return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
-
-
-def main() -> int:
-    args = parse_args()
-    args.output_dir.mkdir(parents=True, exist_ok=True)
-    (args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
-    (args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
-
-    selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
-    selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
-    git_commit = get_git_commit()
-    results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
-
-    jobs = plan_jobs(
-        output_dir=args.output_dir,
-        hub_org=args.hub_org,
-        results_repo=results_repo_id,
-        policies=selected_policies,
-        benchmarks=selected_benchmarks,
-    )
-
-    for job in jobs:
-        script = render_sbatch_script(
-            job=job,
-            output_dir=args.output_dir,
-            results_repo_id=results_repo_id,
-            git_commit=git_commit,
-        )
-        script_path = Path(job.script_path)
-        script_path.write_text(script)
-        script_path.chmod(0o755)
-        if args.submit:
-            subprocess.run(["sbatch", str(script_path)], check=True)
-
-    manifest_path = write_manifest(
-        output_dir=args.output_dir,
-        jobs=jobs,
-        git_commit=git_commit,
-        hub_org=args.hub_org,
-        results_repo=results_repo_id,
-    )
-    print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
-    print(f"Manifest: {manifest_path}")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,8 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-FROM huggingface/lerobot-gpu:latest
+# Benchmark image for LIBERO-plus integration tests.
+# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus
+# fork source + its 6.4 GB perturbation assets.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus .
+# Run:    docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ...

+FROM huggingface/lerobot-gpu:latest
+ENV MUJOCO_GL=egl
+
+# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
+# (wand / ImageMagick / fontconfig) not in the nightly base.
 USER root
 RUN apt-get update \
    && apt-get install -y --no-install-recommends \
@@ -21,28 +31,54 @@ RUN apt-get update \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
 USER user_lerobot

+# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
+# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
+# We install these explicitly instead of via the [libero_plus] extra because
+# the extra's `libero @ git+...` dep installs as a namespace package; we
+# clone the fork and PYTHONPATH-override it below instead.
 RUN uv pip install --no-cache \
-        "robosuite==1.4.1" bddl easydict mujoco matplotlib wand scikit-image gym
+        "robosuite==1.4.1" \
+        "bddl==1.0.1" \
+        "easydict==1.13" \
+        "mujoco==3.7.0" \
+        "matplotlib==3.10.8" \
+        "Wand==0.6.13" \
+        "scikit-image==0.25.2" \
+        "gym==0.26.2"

+# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
+# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
+# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
+# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
+ARG LIBERO_PLUS_SHA=4976dc3
 ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
-RUN git clone --depth=1 https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
+RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
+    && git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
-    && uv pip uninstall hf-libero 2>/dev/null || true
+    && (uv pip uninstall hf-libero 2>/dev/null || true)
 ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"

+# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
+# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml). All
+# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's
+# assets.zip (6.4 GB) under a deep author-internal prefix — extract and
+# flatten it under ${LIBERO_PLUS_ROOT}/assets.
 RUN python -c "\
 from huggingface_hub import hf_hub_download; \
 hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
                filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
    && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
-    && mv /tmp/libero-plus-dl/extract/inspire/hdd/project/embodied-multimodality/public/syfei/libero_new/release/dataset/LIBERO-plus-0/assets \
-          ${LIBERO_PLUS_ROOT}/assets \
+    && ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \
+    && mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \
    && rm -rf /tmp/libero-plus-dl

+# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are
+# non-interactive (it calls input() when the config is missing).
 RUN mkdir -p /home/user_lerobot/.libero \
    && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
       > /home/user_lerobot/.libero/config.yaml

+# Overlay the PR's source code on top of the nightly image.
 COPY --chown=user_lerobot:user_lerobot . .

 CMD ["/bin/bash"]
@@ -0,0 +1,71 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCasa365 integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code and RoboCasa-specific asset setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install robocasa + robosuite as editable clones. pip-installing from git
+# omits data files like robocasa/models/assets/box_links/box_links_assets.json
+# (not declared in package_data), which download_kitchen_assets needs at import.
+#
+# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
+# in install_requires, which would shadow the editable lerobot baked into
+# this image. We install robocasa's actual runtime deps explicitly instead.
+# Pinned SHAs for reproducible benchmark runs. Bump when you need an
+# upstream fix; don't rely on `main`/`master` drift.
+ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
+ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
+RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
+    git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
+    git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
+    git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
+    uv pip install --no-cache -e ~/robocasa --no-deps && \
+    uv pip install --no-cache -e ~/robosuite && \
+    uv pip install --no-cache \
+      "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
+      "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
+      "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
+      "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
+      "tianshou==0.4.10" "gymnasium==1.2.3"
+
+# Set up robocasa macros and download kitchen assets. We need:
+#   - tex              : base environment textures
+#   - tex_generative   : AI-generated textures; kitchen fixture XMLs embed
+#                        refs to generative_textures/wall/tex*.png
+#                        unconditionally, so MjModel.from_xml_string fails
+#                        at reset time without them (even if the env is
+#                        constructed with generative_textures=None).
+#   - fixtures_lw      : lightwheel kitchen fixtures (fridge, counters...)
+#   - objs_lw          : lightwheel object meshes (stools, misc props)
+# We skip the objaverse/aigen object packs (~30GB combined) by pairing
+# this with --env.obj_registries=["lightwheel"] on the lerobot side.
+# The download script prompts interactively, so pipe 'y' to auto-accept.
+RUN python -m robocasa.scripts.setup_macros && \
+    yes y | python -m robocasa.scripts.download_kitchen_assets \
+      --type tex tex_generative fixtures_lw objs_lw
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
+# replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,43 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCerebra integration tests.
+# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different
+# rename_map, so this image is identical to the LIBERO benchmark image: it
+# extends the nightly GPU base with LIBERO assets + the PR's source code.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
+# runtime (which times out on CI). Point the libero config at the cached path.
+# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
+# so we write the config before any libero import can happen.
+RUN LIBERO_DIR=$(python -c \
+      "import importlib.util, os; s=importlib.util.find_spec('libero'); \
+       print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
+    mkdir -p /home/user_lerobot/.libero && \
+    python -c "\
+from huggingface_hub import snapshot_download; \
+snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
+                  local_dir='/home/user_lerobot/.libero/assets')" && \
+    printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
+    > /home/user_lerobot/.libero/config.yaml
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -12,11 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Benchmark image for RoboMME integration tests.
+# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system
+# libs for ManiSkill/SAPIEN and the robomme extra. robomme isn't in [all]
+# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which
+# conflict with lerobot's defaults; both are safe at runtime:
+#   - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26)
+#   - numpy 1.26.4 is API-compatible with lerobot's actual usage.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ...
+
 FROM huggingface/lerobot-gpu:latest

+# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering.
 ENV NVIDIA_DRIVER_CAPABILITIES=all \
    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json

+# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image.
 USER root
 RUN apt-get update \
    && apt-get install -y --no-install-recommends \
@@ -27,6 +40,9 @@ RUN apt-get update \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
 USER user_lerobot

+# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top
+# with gymnasium/numpy overrides. robomme isn't a pyproject extra because its
+# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml).
 COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
 RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
    && uv pip install --no-cache --override /tmp/robomme_override.txt \
@@ -34,6 +50,7 @@ RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
         "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
    && python -c "import robomme; print('robomme import OK')"

+# Overlay the PR's source code on top of the nightly image.
 COPY --chown=user_lerobot:user_lerobot . .

 CMD ["/bin/bash"]
@@ -0,0 +1,138 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboTwin 2.0 integration tests.
+# Extends the nightly GPU image with the RoboTwin simulator stack:
+#   sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
+# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
+#
+# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
+# Run:   docker run --gpus all --rm lerobot-benchmark-robotwin \
+#            lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
+
+FROM huggingface/lerobot-gpu:latest
+
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
+    ROBOTWIN_ROOT=/opt/robotwin
+
+# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
+# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
+USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
+         libvulkan1 vulkan-tools \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+       > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
+    && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# RoboTwin runtime deps (av is already in the base via [av-dep]).
+RUN uv pip install --no-cache \
+        "sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
+        "open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
+
+# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
+RUN uv pip install --no-cache --no-build-isolation \
+        "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+
+# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
+# build aborts on an empty arch list. RoboTwin's own installer pins v0.7.8,
+# which still exposes the v1 API (`curobo.types.math`) that RoboTwin imports.
+ARG CUROBO_REF=v0.7.8
+RUN cd ${ROBOTWIN_ROOT}/envs \
+    && git clone --branch ${CUROBO_REF} --depth 1 https://github.com/NVlabs/curobo.git \
+    && cd curobo \
+    && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
+       uv pip install -e . --no-build-isolation --no-cache
+
+# Upstream patches (mirror RoboTwin's script/_install.sh).
+# These patches target the exact versions pinned above; re-check when upgrading.
+# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
+#   Safe to remove once mplib > 0.2.1 ships with the fix upstream.
+# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
+#   Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
+RUN python - <<'EOF'
+import pathlib, re, site
+def patch(path, label, transform):
+    # Rewrite `path` in place; log whether the text really changed so a non-matching pattern is visible.
+    if path.exists():
+        old = path.read_text()
+        new = transform(old)
+        path.write_text(new)
+        print(f"{label} patch {'applied' if new != old else 'NOT applied (no match)'}: {path}")
+for d in map(pathlib.Path, site.getsitepackages()):
+    patch(d / "mplib" / "planner.py", "mplib", lambda s: re.sub(r"\bor collide\b", "", s, count=1))
+    patch(d / "sapien" / "wrapper" / "urdf_loader.py", "sapien", lambda s: s.replace(
+        "with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'
+    ).replace('"srdf"', '".srdf"'))
+EOF
+
+# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
+# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
+# The dataset is public — no auth token needed.
+RUN python - <<'EOF'
+import os, pathlib, zipfile
+from huggingface_hub import hf_hub_download
+
+# Fetch each archive into $ROBOTWIN_ROOT/assets, unpack it there, then
+# remove the zip within the same RUN step.
+target = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
+target.mkdir(parents=True, exist_ok=True)
+for archive_name in ("embodiments.zip", "objects.zip"):
+    archive = pathlib.Path(hf_hub_download(
+        repo_id="TianxingChen/RoboTwin2.0", repo_type="dataset",
+        filename=archive_name, local_dir=str(target),
+    ))
+    with zipfile.ZipFile(archive, "r") as zf:
+        zf.extractall(str(target))
+    archive.unlink()
+EOF
+
+WORKDIR ${ROBOTWIN_ROOT}
+RUN python script/update_embodiment_config_path.py
+
+ENV PYTHONPATH="${ROBOTWIN_ROOT}"
+
+# Fail the image build early if the CuRobo package layout regresses. Importing
+# RoboTwin's planner here is too eager because CuRobo constructs CUDA-backed
+# defaults at import time, while Docker builds don't have access to an NVIDIA
+# driver.
+RUN python - <<'EOF'
+import os
+from pathlib import Path
+
+from curobo.types.math import Pose
+# Locate the planner via ROBOTWIN_ROOT (the ENV set earlier) instead of a hardcoded path.
+planner_src = (Path(os.environ["ROBOTWIN_ROOT"]) / "envs" / "robot" / "planner.py").read_text()
+assert "from curobo.types.math import Pose as CuroboPose" in planner_src
+print("CuRobo import OK:", Pose.__name__)
+print("RoboTwin planner import references curobo.types.math")
+EOF
+
+# Return to the lerobot source directory (set by base image) before overlaying.
+WORKDIR /lerobot
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for VLABench integration tests.
+# Extends the nightly GPU image with the PR's source code and VLABench setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench .
+# Run:    docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control.
+# Clone without submodule recursion (nested SSH-only submodules fail in CI); full clones, so the pinned SHAs below can be checked out.
+# Editable install (-e) because VLABench/utils/ has no __init__.py, so
+# find_packages() omits it from wheels; editable mode uses the source tree directly.
+# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py).
+# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import
+# time. Guard the call so missing dirs return [] instead of crashing (in case
+# the asset download is partial).
+#
+# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need
+# an upstream fix; don't rely on `main`/`develop` drift.
+ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017
+ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85
+RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \
+    git -C ~/VLABench checkout ${VLABENCH_SHA} && \
+    git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \
+    git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \
+    python3 -c "\
+import pathlib; \
+p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \
+t = p.read_text(); \
+p.write_text(t.replace( \
+    'subdirs = os.listdir(xml_dir)', \
+    'if not os.path.isdir(xml_dir): return []\n    subdirs = os.listdir(xml_dir)'))" && \
+    uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \
+      mujoco==3.2.2 dm-control==1.0.22 \
+      open3d colorlog scikit-learn openai gdown
+
+# Download VLABench mesh assets. Task configs reference object meshes
+# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without
+# them the task builder picks from an empty mesh list and crashes with
+# IndexError at task-build time (random.choice([]) in config_manager.py).
+#
+# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time
+# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll
+# snapshot_download the repo into VLABench's assets dir. This is the reliable
+# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have
+# viewed or downloaded this file recently") on shared academic files.
+#
+# After download we *validate* that at least one XML exists under each
+# task-critical subtree and fail the build loudly if not. Silent-empty asset
+# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them
+# here rather than after a 10-minute eval build.
+#
+# Fallback: VLABench's own gdown-based script. Best-effort only.
+ARG VLABENCH_ASSETS_REPO=""
+RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \
+    if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \
+        echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \
+        uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \
+        python -c "from huggingface_hub import snapshot_download; \
+p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \
+    local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \
+print('snapshot_download returned:', p)"; \
+    else \
+        echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \
+        python ~/VLABench/scripts/download_assets.py --choice all; \
+    fi && \
+    python -c "\
+from pathlib import Path; \
+import sys; \
+root = Path('${ASSETS_DIR}'); \
+checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \
+failed = []; \
+print(f'Validating VLABench assets under {root}'); \
+[print(f'  {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \
+[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \
+sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')"
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with VLABenchEnv registration
+# and updated obs handling) replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
@@ -77,10 +77,22 @@
    title: Adding a New Benchmark
  - local: libero
    title: LIBERO
+  - local: libero_plus
+    title: LIBERO-plus
  - local: metaworld
    title: Meta-World
+  - local: robotwin
+    title: RoboTwin 2.0
+  - local: robocasa
+    title: RoboCasa365
+  - local: robocerebra
+    title: RoboCerebra
+  - local: robomme
+    title: RoboMME
  - local: envhub_isaaclab_arena
    title: NVIDIA IsaacLab Arena Environments
+  - local: vlabench
+    title: VLABench
  title: "Benchmarks"
 - sections:
  - local: introduction_processors
@@ -685,6 +685,10 @@ Example configuration for training the [reward classifier](https://huggingface.c

 ```json
 {
+  "dataset": {
+    "repo_id": "hf_username/dataset_name",
+    "root": null
+  },
  "policy": {
    "type": "reward_classifier",
    "model_name": "helper2424/resnet10",
@@ -705,8 +709,28 @@ Example configuration for training the [reward classifier](https://huggingface.c
        "type": "VISUAL",
        "shape": [3, 128, 128]
      }
-    }
-  }
+    },
+    "push_to_hub": true,
+    "repo_id": "hf_username/model_repo"
+  },
+  "batch_size": 16,
+  "num_workers": 4,
+  "steps": 5000,
+  "log_freq": 10,
+  "eval_freq": 1000,
+  "save_freq": 1000,
+  "save_checkpoint": true,
+  "seed": 2,
+  "resume": false,
+  "optimizer": {
+    "grad_clip_norm": 10.0
+  },
+  "wandb": {
+    "enable": true,
+    "project": "reward-classifier",
+    "disable_artifact": false
+  },
+  "job_name": "reward-classifier"
 }
 ```

@@ -32,6 +32,12 @@ Once you’ve gathered enough trajectories, you’ll train a neural network to i

 If you run into any issues at any point, jump into our [Discord community](https://discord.com/invite/s3KuuzsPFb) for support.

+<Tip>
+
+Want to quickly get the right commands for your setup? The [quickstart notebook](https://github.com/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb) lets you configure your robot once and generates all the commands below ready to paste.
+
+</Tip>
+
 ## Set up and Calibrate

 If you haven't yet set up and calibrated your robot and teleop device, please do so by following the robot-specific tutorial.
@@ -0,0 +1,188 @@
+# LIBERO-plus
+
+LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss.
+
+- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626)
+- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus)
+- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
+
+![An overview of the LIBERO-plus benchmark perturbation dimensions](https://github.com/sylvestf/LIBERO-plus/raw/main/static/images/libero-plus.jpg)
+
+## Perturbation dimensions
+
+LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes:
+
+| Dimension             | What changes                                          |
+| --------------------- | ----------------------------------------------------- |
+| Objects layout        | Target position, presence of confounding objects      |
+| Camera viewpoints     | Camera position, orientation, field-of-view           |
+| Robot initial states  | Manipulator start pose                                |
+| Language instructions | LLM-rewritten task description (paraphrase / synonym) |
+| Light conditions      | Intensity, direction, color, shadow                   |
+| Background textures   | Scene surface and object appearance                   |
+| Sensor noise          | Photometric distortions and image degradation         |
+
+## Available task suites
+
+LIBERO-plus covers the same five suites as LIBERO:
+
+| Suite          | CLI name         | Tasks | Max steps | Description                                        |
+| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- |
+| LIBERO-Spatial | `libero_spatial` | 10    | 280       | Tasks requiring reasoning about spatial relations  |
+| LIBERO-Object  | `libero_object`  | 10    | 280       | Tasks centered on manipulating different objects   |
+| LIBERO-Goal    | `libero_goal`    | 10    | 300       | Goal-conditioned tasks with changing targets       |
+| LIBERO-90      | `libero_90`      | 90    | 400       | Short-horizon tasks from the LIBERO-100 collection |
+| LIBERO-Long    | `libero_10`      | 10    | 520       | Long-horizon tasks from the LIBERO-100 collection  |
+
+<Tip warning={true}>
+  Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero`
+  so that `import libero` resolves to the LIBERO-plus fork. You cannot have both
+  installed at the same time. To switch back to vanilla LIBERO, uninstall the
+  fork and reinstall with `pip install -e ".[libero]"`.
+</Tip>
+
+## Installation
+
+### System dependencies (Linux only)
+
+```bash
+sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev
+```
+
+### Python package
+
+```bash
+pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym
+git clone https://github.com/sylvestf/LIBERO-plus.git
+cd LIBERO-plus && pip install --no-deps -e .
+pip uninstall -y hf-libero  # so `import libero` resolves to the fork
+```
+
+LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't package properly, so it must be cloned and installed in editable mode (as shown above) or added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported.
+
+<Tip>
+Set the MuJoCo rendering backend before running evaluation:
+
+```bash
+export MUJOCO_GL=egl   # headless / HPC / cloud
+```
+
+</Tip>
+
+### Download LIBERO-plus assets
+
+LIBERO-plus ships its extended asset pack separately. Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
+
+```bash
+# After installing the package, find where it was installed:
+python -c "import libero; print(libero.__file__)"
+# Then extract assets.zip into <package_root>/libero/assets/
+```
+
+## Evaluation
+
+### Default evaluation (recommended)
+
+Evaluate across the four standard suites (10 episodes per task):
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object,libero_goal,libero_10 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --env.max_parallel_tasks=1
+```
+
+### Single-suite evaluation
+
+Evaluate on one LIBERO-plus suite:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
+- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
+- `--eval.batch_size` controls how many environments run in parallel.
+- `--eval.n_episodes` sets how many episodes to run per task.
+
+### Multi-suite evaluation
+
+Benchmark a policy across multiple suites at once by passing a comma-separated list:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+### Control mode
+
+LIBERO-plus supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
+
+```bash
+--env.control_mode=relative   # or "absolute"
+```
+
+### Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
+- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
+- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
+
+### Recommended evaluation episodes
+
+For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
+
+## Training
+
+### Dataset
+
+A LeRobot-format training dataset for LIBERO-plus is available at:
+
+- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
+
+### Example training command
+
+```bash
+lerobot-train \
+    --policy.type=smolvla \
+    --policy.repo_id=${HF_USER}/smolvla_libero_plus \
+    --policy.load_vlm_weights=true \
+    --dataset.repo_id=lerobot/libero_plus \
+    --env.type=libero_plus \
+    --env.task=libero_spatial \
+    --output_dir=./outputs/ \
+    --steps=100000 \
+    --batch_size=4 \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1 \
+    --eval_freq=1000
+```
+
+## Relationship to LIBERO
+
+LIBERO-plus is a drop-in extension of LIBERO:
+
+- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`)
+- Same camera names and observation/action format
+- Same task suite names
+- Installs under the same `libero` Python package name (different GitHub repo)
+
+To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`.
@@ -0,0 +1,188 @@
+# RoboCasa365
+
+[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
+
+- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
+- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
+- Project website: [robocasa.ai](https://robocasa.ai)
+- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
+- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
+
+<img
+  src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
+  alt="RoboCasa365 benchmark overview"
+  width="85%"
+/>
+
+## Available tasks
+
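+RoboCasa365 organizes its 365 tasks into two families (table below). LeRobot additionally exposes the three upstream benchmark groups, plus the `pretrain*` subsets, as first-class `--env.task` shortcuts (see the accepted `--env.task` forms further down):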
+
+| Family    | Tasks | Description                                                                     |
+| --------- | ----- | ------------------------------------------------------------------------------- |
+| Atomic    | ~65   | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
+| Composite | ~300  | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc.     |
+
+**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
+
+**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
+
+`--env.task` accepts three forms:
+
+- a single task name (`CloseFridge`)
+- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
+- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
+
+## Installation
+
+RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its conflicting `lerobot==0.3.3` pin):
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/robocasa/robocasa.git ~/robocasa
+git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
+pip install -e ~/robocasa --no-deps
+pip install -e ~/robosuite
+
+# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
+# the bad lerobot pin).
+pip install numpy numba scipy mujoco pygame Pillow opencv-python \
+            pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
+            tianshou gymnasium
+
+python -m robocasa.scripts.setup_macros
+# Lightweight assets (lightwheel object meshes + textures). Enough for
+# the default env out of the box.
+python -m robocasa.scripts.download_kitchen_assets \
+  --type tex tex_generative fixtures_lw objs_lw
+# Optional: full objaverse/aigen registries (~30GB) for richer object
+# variety. Enable at eval time via --env.obj_registries (see below).
+# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
+```
+
+<Tip>
+RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+### Object registries
+
+By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
+
+```bash
+--env.obj_registries='[objaverse,lightwheel]'
+```
+
+## Evaluation
+
+All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Benchmark-group evaluation
+
+Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=atomic_seen \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+Use **20 episodes per task** for reproducible benchmarking; this matches the protocol used in published results.
+
+## Policy inputs and outputs
+
+**Observations** (raw RoboCasa camera names are preserved verbatim):
+
+- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
+- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
+- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
+- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
+
+## Training
+
+### Single-task example
+
+A ready-to-use single-task dataset is on the Hub:
+[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
+
+Fine-tune a SmolVLA base on `CloseFridge`:
+
+```bash
+lerobot-train \
+  --policy.type=smolvla \
+  --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --policy.load_vlm_weights=true \
+  --policy.push_to_hub=true \
+  --dataset.repo_id=pepijn223/robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --output_dir=./outputs/smolvla_robocasa_CloseFridge \
+  --steps=100000 \
+  --batch_size=4 \
+  --eval_freq=5000 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5 \
+  --save_freq=10000
+```
+
+Evaluate the resulting checkpoint:
+
+```bash
+lerobot-eval \
+  --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
@@ -0,0 +1,99 @@
+# RoboCerebra
+
+[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF).
+
+- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
+- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/)
+- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks.
+- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra)
+
+## Available tasks
+
+RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite:
+
+| Suite     | CLI name    | Tasks | Description                                                   |
+| --------- | ----------- | ----- | ------------------------------------------------------------- |
+| LIBERO-10 | `libero_10` | 10    | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals |
+
+Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals.
+
+## Installation
+
+RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need:
+
+```bash
+pip install -e ".[libero]"
+```
+
+<Tip>
+RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+## Evaluation
+
+RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocerebra \
+  --env.type=libero \
+  --env.task=libero_10 \
+  --env.fps=20 \
+  --env.obs_type=pixels_agent_pos \
+  --env.observation_height=256 \
+  --env.observation_width=256 \
+  '--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \
+  --policy.empty_cameras=1
+```
+
+### Recommended evaluation episodes
+
+**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper.
+
+## Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper)
+- `observation.images.image` — third-person view, 256×256 HWC uint8
+- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D)
+
+## Training
+
+The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations:
+
+| Feature                          | Shape         | Description          |
+| -------------------------------- | ------------- | -------------------- |
+| `observation.images.image`       | (256, 256, 3) | Third-person view    |
+| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera |
+| `observation.state`              | (8,)          | Joint pos + gripper  |
+| `action`                         | (7,)          | EEF delta + gripper  |
+
+Fine-tune a SmolVLA base on it:
+
+```bash
+lerobot-train \
+  --policy.path=lerobot/smolvla_base \
+  --dataset.repo_id=lerobot/robocerebra_unified \
+  --env.type=libero \
+  --env.task=libero_10 \
+  --output_dir=outputs/smolvla_robocerebra
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.
@@ -0,0 +1,130 @@
+# RoboMME
+
+[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
+
+- **16 tasks** across 4 memory-skill suites
+- **1,600 training demos** (100 per task, 50 val, 50 test)
+- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
+- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
+
+![RoboMME benchmark tasks overview](https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2603.04639/gradient.png)
+
+## Tasks
+
+| Suite                             | Tasks                                                         |
+| --------------------------------- | ------------------------------------------------------------- |
+| **Counting** (temporal memory)    | BinFill, PickXtimes, SwingXtimes, StopCube                    |
+| **Permanence** (spatial memory)   | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap  |
+| **Reference** (object memory)     | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
+| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick                  |
+
+## Installation
+
+> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
+
+### Native (Linux)
+
+```bash
+uv pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
+  -e '.[smolvla,av-dep]' \
+  'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
+```
+
+> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely.
+
+### Docker (recommended)
+
+```bash
+# Build base image first (from repo root)
+docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
+
+# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
+docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
+```
+
+The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
+
+## Running Evaluation
+
+### Default (single task, single episode)
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_repo> \
+    --env.type=robomme \
+    --env.task=PickXtimes \
+    --env.dataset_split=test \
+    --env.task_ids=[0] \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1
+```
+
+### Multi-task evaluation
+
+Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_repo> \
+    --env.type=robomme \
+    --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+    --env.dataset_split=test \
+    --env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
+    --eval.batch_size=1 \
+    --eval.n_episodes=50
+```
+
+### Key CLI options for `env.type=robomme`
+
+| Option               | Default       | Description                                        |
+| -------------------- | ------------- | -------------------------------------------------- |
+| `env.task`           | `PickXtimes`  | Any of the 16 task names above (comma-separated)   |
+| `env.dataset_split`  | `test`        | `train`, `val`, or `test`                          |
+| `env.action_space`   | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D)             |
+| `env.episode_length` | `300`         | Max steps per episode                              |
+| `env.task_ids`       | `null`        | List of episode indices to evaluate (null = `[0]`) |
+
+## Dataset
+
+The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
+
+```python
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+dataset = LeRobotDataset("lerobot/robomme")
+```
+
+### Dataset features
+
+| Feature            | Shape         | Description                     |
+| ------------------ | ------------- | ------------------------------- |
+| `image`            | (256, 256, 3) | Front camera RGB                |
+| `wrist_image`      | (256, 256, 3) | Wrist camera RGB                |
+| `actions`          | (8,)          | Joint angles + gripper          |
+| `state`            | (8,)          | Joint positions + gripper state |
+| `simple_subgoal`   | str           | High-level language annotation  |
+| `grounded_subgoal` | str           | Grounded language annotation    |
+| `episode_index`    | int           | Episode ID                      |
+| `frame_index`      | int           | Frame within episode            |
+
+### Feature key alignment (training)
+
+The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`.
+
+The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning.
+
+## Action Spaces
+
+| Type          | Dim | Description                                               |
+| ------------- | --- | --------------------------------------------------------- |
+| `joint_angle` | 8   | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) |
+| `ee_pose`     | 7   | xyz + roll/pitch/yaw + gripper                            |
+
+Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`.
+
+## Platform Notes
+
+- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported.
+- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed.
+- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically.
+- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package.
@@ -0,0 +1,223 @@
+# RoboTwin 2.0
+
+RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
+
+- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
+- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
+- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
+- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
+
+![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png)
+
+## Overview
+
+| Property      | Value                                                    |
+| ------------- | -------------------------------------------------------- |
+| Tasks         | 50 dual-arm manipulation tasks                           |
+| Robot         | Aloha-AgileX bimanual (14 DOF, 7 per arm)                |
+| Action space  | 14-dim joint-space, continuous in `[-1, 1]`              |
+| Cameras       | `head_camera`, `left_camera`, `right_camera`             |
+| Simulator     | SAPIEN (not MuJoCo)                                      |
+| Eval protocol | 100 eval episodes/task; train on 50 `demo_clean` demos   |
+| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
+
+## Available tasks
+
+RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
+
+| Task                     | CLI name                                 | Category          |
+| ------------------------ | ---------------------------------------- | ----------------- |
+| Beat block with hammer   | `beat_block_hammer`                      | Tool use          |
+| Click bell / alarm clock | `click_bell`, `click_alarmclock`         | Precision press   |
+| Stack blocks (2 / 3)     | `stack_blocks_two`, `stack_blocks_three` | Stacking          |
+| Stack bowls (2 / 3)      | `stack_bowls_two`, `stack_bowls_three`   | Stacking          |
+| Handover block / mic     | `handover_block`, `handover_mic`         | Bimanual coord.   |
+| Lift pot                 | `lift_pot`                               | Bimanual lift     |
+| Shake bottle             | `shake_bottle`                           | Continuous motion |
+| Turn switch              | `turn_switch`                            | Articulated obj   |
+| Stamp seal               | `stamp_seal`                             | Precision place   |
+| Scan object              | `scan_object`                            | Mobile manip.     |
+
+Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
+
+<Tip warning={true}>
+  `open_laptop` is currently broken upstream (its `check_success()` uses
+  `self.arm_tag`, which is only set inside the scripted-expert `play_once()`
+  path and therefore unavailable during normal policy eval). Avoid it until the
+  upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
+  `load_actors()`.
+</Tip>
+
+## Dataset
+
+The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
+
+```
+lerobot/robotwin_unified
+```
+
+It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
+
+You can load it directly with the HF Datasets library:
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("lerobot/robotwin_unified", split="train")
+```
+
+## Installation
+
+RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
+
+### 1. Create a conda environment
+
+```bash
+conda create -n robotwin python=3.10 -y
+conda activate robotwin
+```
+
+### 2. Install LeRobot
+
+```bash
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
+pip install -e "."
+```
+
+### 3. Install RoboTwin 2.0
+
+```bash
+git clone https://github.com/RoboTwin-Platform/RoboTwin.git
+cd RoboTwin
+bash script/_install.sh
+bash script/_download_assets.sh
+```
+
+The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
+
+<Tip warning={true}>
+If the automated install fails, install manually:
+
+```bash
+pip install -r requirements.txt
+pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
+pip install -e . --no-build-isolation
+```
+
+Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
+
+</Tip>
+
+### 4. Add RoboTwin to PYTHONPATH
+
+The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
+
+```bash
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+```
+
+Add this to your shell profile to make it permanent.
+
+## Evaluation
+
+### Standard evaluation (recommended)
+
+Evaluate a policy on a single task with the official protocol (100 episodes):
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Single-task quick check
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5
+```
+
+### Multi-task sweep
+
+Evaluate on several tasks in one run:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Full benchmark (all 50 tasks)
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+<Tip>
+  `open_laptop` is intentionally omitted above because of the upstream
+  `self.arm_tag` bug (see the **Available tasks** section). Re-add it once the
+  upstream fix lands.
+</Tip>
+
+## Camera configuration
+
+By default, all three cameras are included:
+
+| Camera key     | Description                    |
+| -------------- | ------------------------------ |
+| `head_camera`  | Torso-mounted overhead view    |
+| `left_camera`  | Left arm wrist-mounted camera  |
+| `right_camera` | Right arm wrist-mounted camera |
+
+To use a subset of cameras, override `--env.camera_names`:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --env.camera_names="head_camera,left_camera" \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+## Environment config reference
+
+Key parameters for `RoboTwinEnvConfig`:
+
+| Parameter            | Default                                  | Description                        |
+| -------------------- | ---------------------------------------- | ---------------------------------- |
+| `task`               | `"beat_block_hammer"`                    | Comma-separated task name(s)       |
+| `fps`                | `25`                                     | Simulation FPS                     |
+| `episode_length`     | `300`                                    | Max steps per episode              |
+| `obs_type`           | `"pixels_agent_pos"`                     | `"pixels"` or `"pixels_agent_pos"` |
+| `camera_names`       | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras     |
+| `observation_height` | `240`                                    | Camera pixel height                |
+| `observation_width`  | `320`                                    | Camera pixel width                 |
+
+## Leaderboard submission
+
+Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
+
+- Training on 50 `demo_clean` demonstrations per task
+- Evaluating 100 episodes per task
+- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
+
+For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
@@ -0,0 +1,176 @@
+# VLABench
+
+[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below).
+
+- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194)
+- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench)
+- Project website: [vlabench.github.io](https://vlabench.github.io)
+- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench)
+
+<img
+  src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/vlabench.png"
+  alt="VLABench benchmark overview"
+  width="85%"
+/>
+
+## Available tasks
+
+VLABench ships two task suites covering **43 task categories** in LeRobot's `--env.task` surface:
+
+| Suite     | CLI name    | Tasks | Description                                                      |
+| --------- | ----------- | ----- | ---------------------------------------------------------------- |
+| Primitive | `primitive` | 21    | Single / few-skill combinations (select, insert, physics QA)     |
+| Composite | `composite` | 22    | Multi-step reasoning and long-horizon planning (cook, rearrange) |
+
+**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`).
+
+**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`.
+
+`--env.task` accepts three forms:
+
+- a single task name (`select_fruit`)
+- a comma-separated list (`select_fruit,heat_food`)
+- a suite shortcut (`primitive`, `composite`, or `primitive,composite`)
+
+## Installation
+
+VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets:
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench
+git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms
+pip install -e ~/VLABench -e ~/rrt-algorithms
+pip install "mujoco==3.2.2" "dm-control==1.0.22" \
+            open3d colorlog scikit-learn openai gdown
+
+python ~/VLABench/scripts/download_assets.py
+```
+
+<Tip>
+VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+## Evaluation
+
+All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=select_fruit \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=select_fruit,select_toy,add_condiment,heat_food \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Suite-wide evaluation
+
+Run an entire suite (all 21 primitives or all 22 composites):
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=primitive \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  --env.max_parallel_tasks=1 \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+Or both suites:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=primitive,composite \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  --env.max_parallel_tasks=1 \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper.
+
+## Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper)
+- `observation.images.image` — front camera, 480×480 HWC uint8
+- `observation.images.second_image` — second camera, 480×480 HWC uint8
+- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper.
+
+## Training
+
+### Datasets
+
+Pre-collected VLABench datasets in LeRobot format on the Hub:
+
+- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images.
+- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images.
+
+### Example training command
+
+Fine-tune a SmolVLA base on the primitive suite:
+
+```bash
+lerobot-train \
+  --policy.type=smolvla \
+  --policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \
+  --policy.load_vlm_weights=true \
+  --policy.push_to_hub=true \
+  --dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \
+  --env.type=vlabench \
+  --env.task=select_fruit \
+  --output_dir=./outputs/smolvla_vlabench_primitive \
+  --steps=100000 \
+  --batch_size=4 \
+  --eval_freq=5000 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=1 \
+  --save_freq=10000
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark.
@@ -220,7 +220,7 @@ REAL_DIM = 12
 # Postprocessing: Trim 20D predictions to 12D for deployment
 ```

-See the [action_hub.py](/home/jade_choghari/robot/lerobot/src/lerobot/policies/xvla/action_hub.py) implementation for details.
+See the [action_hub.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py) implementation for details.

 #### Auto Action Mode (Recommended)

@@ -519,9 +519,9 @@ If you use X-VLA in your research, please cite:

 - [X-VLA Paper](https://arxiv.org/pdf/2510.10274)
 - [LeRobot Documentation](https://github.com/huggingface/lerobot)
- [Action Registry Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/action_hub.py)
- [Processor Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/processor_xvla.py)
- [Model Configuration](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/configuration_xvla.py)
+- [Action Registry Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py)
+- [Processor Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/processor_xvla.py)
+- [Model Configuration](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/configuration_xvla.py)

 ## Contributing

@@ -0,0 +1,342 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 🤗 LeRobot Quickstart\n",
+    "\n",
+    "Calibration → teleoperation → data collection → training → evaluation.\n",
+    "\n",
+    "Install the required dependencies: `pip install -e .[notebook,dataset,training,viz,hardware]`.\n",
+    "\n",
+    "**How to use:**\n",
+    "1. Edit the **Configuration** cell with your settings.\n",
+    "2. Run all cells (`Run All`).\n",
+    "3. Each section prints a ready-to-paste terminal command - copy it and run it.\n",
+    "\n",
+    "Each setup is different; please refer to the [LeRobot documentation](https://huggingface.co/docs/lerobot/il_robots) for more details on each step and the available options. <br>\n",
+    "Feel free to make this notebook your own and adapt it to your needs!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _cameras_arg(cameras: dict) -> str:\n",
+    "    if not cameras:\n",
+    "        return \"\"\n",
+    "    entries = [f\"{n}: {{{', '.join(f'{k}: {v}' for k, v in cfg.items())}}}\" for n, cfg in cameras.items()]\n",
+    "    return \"{ \" + \", \".join(entries) + \" }\"\n",
+    "\n",
+    "\n",
+    "def print_cmd(*parts: str) -> None:\n",
+    "    \"\"\"Print a shell command with line continuations, skipping empty parts.\"\"\"\n",
+    "    non_empty = [p for p in parts if p]\n",
+    "    print(\" \\\\\\n    \".join(non_empty))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Configuration\n",
+    "\n",
+    "Edit this cell, then **Run All** to generate all commands below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Robot (follower) - run `lerobot-find-port` to discover the port\n",
+    "ROBOT_TYPE = \"so101_follower\"\n",
+    "ROBOT_PORT = \"/dev/ttyACM0\"\n",
+    "ROBOT_ID = \"my_follower_arm\"\n",
+    "\n",
+    "# Teleop (leader) - run `lerobot-find-port` to discover the port\n",
+    "TELEOP_TYPE = \"so101_leader\"\n",
+    "TELEOP_PORT = \"/dev/ttyACM1\"\n",
+    "TELEOP_ID = \"my_leader_arm\"\n",
+    "\n",
+    "# Cameras - set to {} to disable\n",
+    "# Run `lerobot-find-cameras opencv` to list available cameras and their indices\n",
+    "CAMERAS = {\n",
+    "    \"top\": {\"type\": \"opencv\", \"index_or_path\": 2, \"width\": 640, \"height\": 480, \"fps\": 30},\n",
+    "    \"wrist\": {\"type\": \"opencv\", \"index_or_path\": 4, \"width\": 640, \"height\": 480, \"fps\": 30},\n",
+    "}\n",
+    "\n",
+    "# Dataset\n",
+    "HF_USER = \"your_hf_username\"  # `huggingface-cli whoami` to find your username\n",
+    "DATASET_NAME = \"my_so101_dataset\"\n",
+    "TASK_DESCRIPTION = \"pick and place the block\"\n",
+    "NUM_EPISODES = 10\n",
+    "\n",
+    "# Training\n",
+    "POLICY_TYPE = \"act\"  # act, diffusion, smolvla, ...\n",
+    "POLICY_DEVICE = \"cuda\"  # cuda / cpu / mps\n",
+    "TRAIN_STEPS = 10_000\n",
+    "SAVE_FREQ = 2_000\n",
+    "OUTPUT_DIR = f\"outputs/train/{DATASET_NAME}\"\n",
+    "\n",
+    "# Inference - Hub repo ID or local checkpoint path\n",
+    "# e.g. set to f\"{OUTPUT_DIR}/checkpoints/last\" to use a local checkpoint\n",
+    "POLICY_PATH = f\"{HF_USER}/{DATASET_NAME}_{POLICY_TYPE}\"\n",
+    "LAST_CHECKPOINT_PATH = f\"{OUTPUT_DIR}/checkpoints/last\"\n",
+    "\n",
+    "# Derived\n",
+    "DATASET_REPO_ID = f\"{HF_USER}/{DATASET_NAME}\"\n",
+    "DATASET_ROOT = f\"data/{DATASET_NAME}\"\n",
+    "POLICY_REPO_ID = f\"{HF_USER}/{DATASET_NAME}_{POLICY_TYPE}\"\n",
+    "EVAL_REPO_ID = f\"{HF_USER}/eval_{DATASET_NAME}\"\n",
+    "CAMERAS_ARG = _cameras_arg(CAMERAS)\n",
+    "CAMERAS_FLAG = f'--robot.cameras=\"{CAMERAS_ARG}\"' if CAMERAS_ARG else \"\"\n",
+    "\n",
+    "print(f\"Robot  : {ROBOT_TYPE} @ {ROBOT_PORT}\")\n",
+    "print(f\"Teleop : {TELEOP_TYPE} @ {TELEOP_PORT}\")\n",
+    "print(f\"Cameras: {list(CAMERAS) or 'none'}\")\n",
+    "print(f\"Dataset: {DATASET_REPO_ID} ({NUM_EPISODES} episodes) saved to {DATASET_ROOT}\")\n",
+    "print(f\"Policy : {POLICY_TYPE} -> {POLICY_REPO_ID}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 1. Calibration\n",
+    "\n",
+    "Run once per arm before first use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Follower\n",
+    "print_cmd(\n",
+    "    \"lerobot-calibrate\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Leader\n",
+    "print_cmd(\n",
+    "    \"lerobot-calibrate\",\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 2. Teleoperation\n",
+    "\n",
+    "See the [teleoperation docs](https://huggingface.co/docs/lerobot/il_robots#teleoperate) and the [cameras guide](https://huggingface.co/docs/lerobot/cameras) for more options."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-teleoperate\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    \"--display_data=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Record Dataset\n",
+    "\n",
+    "See the [recording docs](https://huggingface.co/docs/lerobot/il_robots#record-a-dataset) for tips on gathering good data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    "    \"--display_data=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Resume a previously interrupted recording session\n",
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--dataset.root={DATASET_ROOT}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    "    \"--display_data=true\",\n",
+    "    \"--resume=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. Train Policy\n",
+    "\n",
+    "See the [training docs](https://huggingface.co/docs/lerobot/il_robots#train-a-policy) for configuration options and tips."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-train\",\n",
+    "    f\"--dataset.repo_id={DATASET_REPO_ID}\",\n",
+    "    f\"--policy.type={POLICY_TYPE}\",\n",
+    "    f\"--policy.device={POLICY_DEVICE}\",\n",
+    "    f\"--policy.repo_id={POLICY_REPO_ID}\",\n",
+    "    f\"--output_dir={OUTPUT_DIR}\",\n",
+    "    f\"--steps={TRAIN_STEPS}\",\n",
+    "    f\"--save_freq={SAVE_FREQ}\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Resume a previously interrupted training session\n",
+    "print_cmd(\n",
+    "    \"lerobot-train\",\n",
+    "    f\"--config_path={LAST_CHECKPOINT_PATH}/pretrained_model/train_config.json\",\n",
+    "    \"--resume=true\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. Inference\n",
+    "\n",
+    "Uses `POLICY_PATH` from the Configuration cell (defaults to the Hub repo ID). You can also set it to `LAST_CHECKPOINT_PATH` to run a local checkpoint.\n",
+    "\n",
+    "See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_cmd(\n",
+    "    \"lerobot-record\",\n",
+    "    f\"--policy.path={POLICY_PATH}\",\n",
+    "    f\"--robot.type={ROBOT_TYPE}\",\n",
+    "    f\"--robot.port={ROBOT_PORT}\",\n",
+    "    f\"--robot.id={ROBOT_ID}\",\n",
+    "    CAMERAS_FLAG,\n",
+    "    f\"--teleop.type={TELEOP_TYPE}\",\n",
+    "    f\"--teleop.port={TELEOP_PORT}\",\n",
+    "    f\"--teleop.id={TELEOP_ID}\",\n",
+    "    f\"--dataset.repo_id={EVAL_REPO_ID}\",\n",
+    "    f\"--dataset.num_episodes={NUM_EPISODES}\",\n",
+    "    f'--dataset.single_task=\"{TASK_DESCRIPTION}\"',\n",
+    "    \"--dataset.streaming_encoding=true\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "lerobot (3.12.3)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -108,9 +108,9 @@ training = [
    "wandb>=0.24.0,<0.25.0",
 ]
 hardware = [
-    "pynput>=1.7.8,<1.9.0",
-    "pyserial>=3.5,<4.0",
-    "deepdiff>=7.0.1,<9.0.0",
+    "lerobot[pynput-dep]",
+    "lerobot[pyserial-dep]",
+    "lerobot[deepdiff-dep]",
 ]
 viz = [
    "rerun-sdk>=0.24.0,<0.27.0",
@@ -136,10 +136,14 @@ scipy-dep = ["scipy>=1.14.0,<2.0.0"]
 diffusers-dep = ["diffusers>=0.27.2,<0.36.0"]
 qwen-vl-utils-dep = ["qwen-vl-utils>=0.0.11,<0.1.0"]
 matplotlib-dep = ["matplotlib>=3.10.3,<4.0.0", "contourpy>=1.3.0,<2.0.0"] # NOTE: Explicitly listing contourpy helps the resolver converge faster.
+pyserial-dep = ["pyserial>=3.5,<4.0"]
+deepdiff-dep = ["deepdiff>=7.0.1,<9.0.0"]
+pynput-dep = ["pynput>=1.7.8,<1.9.0"]
+pyzmq-dep = ["pyzmq>=26.2.1,<28.0.0"]

 # Motors
-feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"]
-dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0"]
+feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
+dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
 damiao = ["lerobot[can-dep]"]
 robstride = ["lerobot[can-dep]"]

@@ -147,10 +151,11 @@ robstride = ["lerobot[can-dep]"]
 openarms = ["lerobot[damiao]"]
 gamepad = ["lerobot[pygame-dep]", "hidapi>=0.14.0,<0.15.0"]
 hopejr = ["lerobot[feetech]", "lerobot[pygame-dep]"]
-lekiwi = ["lerobot[feetech]", "pyzmq>=26.2.1,<28.0.0"]
+lekiwi = ["lerobot[feetech]", "lerobot[pyzmq-dep]"]
 unitree_g1 = [
    # "unitree-sdk2==1.0.1",
-    "pyzmq>=26.2.1,<28.0.0",
+    "lerobot[pyzmq-dep]",
+    "lerobot[pyserial-dep]",
    "onnxruntime>=1.16.0,<2.0.0",
    "onnx>=1.16.0,<2.0.0",
    "meshcat>=0.3.0,<0.4.0",
@@ -196,7 +201,8 @@ async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
 peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]

 # Development
-dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1", "ruff>=0.14.1"]
+dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1", "ruff>=0.14.1", "lerobot[notebook]"]
+notebook = ["jupyter>=1.0.0,<2.0.0", "ipykernel>=6.0.0,<7.0.0"]
 test = ["pytest>=8.1.0,<9.0.0", "pytest-timeout>=2.4.0,<3.0.0", "pytest-cov>=5.0.0,<8.0.0", "mock-serial>=0.0.1,<0.1.0 ; sys_platform != 'win32'"]
 video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]

@@ -206,6 +212,20 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
 pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
 libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
 metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
+# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
+# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
+# release), so any `vlabench>=X` pip spec is unresolvable. Install it
+# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx
+# for the recipe.
+# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
+# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
+# resolve into a single env. Install it only in the RoboMME Docker image
+# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme).
+# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
+# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
+# workspace `lerobot` and makes the graph unsolvable under any resolver
+# (uv, pip). Install it manually alongside robosuite — see
+# docs/source/robocasa.mdx for the recipe.

 # All
 all = [
@@ -35,9 +35,11 @@ import re
 import sys
 from pathlib import Path

-
 # LIBERO-plus derives task.language by space-joining the perturbation-variant
-# filename, so strip the perturbation metadata blob to recover the base prompt.
+# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py),
+# so non-_language_ variants inherit a trailing metadata blob like
+# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so
+# the description matches the base instruction used in the training dataset.
 _LIBERO_PERTURBATION_TAIL_RE = re.compile(
    r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
 )
@@ -72,29 +74,120 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
    return {f"{task_name}_0": label}


-def _robomme_descriptions(task_names: str) -> dict[str, str]:
-    return {
-        f"{task_name}_0": task_name.replace("_", " ").strip()
-        for task_name in (task.strip() for task in task_names.split(","))
-        if task_name
-    }
+def _robotwin_descriptions(task_names: str) -> dict[str, str]:
+    """Return `{<task>_0: description}` for each requested RoboTwin task. Reads
+    `description/task_instruction/<task>.json` relative to the cwd (/opt/robotwin in CI).
+    Falls back to the task name with underscores as spaces if the file or field is missing."""
+    out: dict[str, str] = {}
+    root = Path("description/task_instruction")
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc_file = root / f"{name}.json"
+        desc = name.replace("_", " ")
+        if desc_file.is_file():
+            data = json.loads(desc_file.read_text())
+            full = data.get("full_description") or desc
+            # Remove "<" and ">" characters only; "{A}"/"{a}" braces are left as-is.
+            desc = full.replace("<", "").replace(">", "")
+        out[f"{name}_0"] = desc
+    return out
+
+
+def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
+    """Map `<task>_0` to a lowercased, space-split label for each comma-separated task.
+
+    RoboCasa episodes carry their language instruction in the env's
+    `ep_meta['lang']`, populated per reset. Pulling it requires spinning
+    up the full kitchen env per task (~seconds each); we use the task
+    name as the key here and let the eval's episode info carry the
+    actual instruction.
+    """
+    out: dict[str, str] = {}
+    for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+        # Each uppercase letter starts a new lowercase word: "CloseFridge" → "close fridge".
+        label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
+        out[f"{task}_0"] = label or task
+    return out
+
+
+_ROBOMME_DESCRIPTIONS = {
+    "BinFill": "Fill the target bin with the correct number of cubes",
+    "PickXtimes": "Pick the indicated cube the specified number of times",
+    "SwingXtimes": "Swing the object the specified number of times",
+    "StopCube": "Grasp and stop the moving cube",
+    "VideoUnmask": "Pick the cube shown in the reference video",
+    "VideoUnmaskSwap": "Pick the cube matching the reference video after a swap",
+    "ButtonUnmask": "Press the button indicated by the reference",
+    "ButtonUnmaskSwap": "Press the correct button after objects are swapped",
+    "PickHighlight": "Pick the highlighted cube",
+    "VideoRepick": "Repick the cube shown in the reference video",
+    "VideoPlaceButton": "Place the cube on the button shown in the video",
+    "VideoPlaceOrder": "Place cubes in the order shown in the video",
+    "MoveCube": "Move the cube to the target location",
+    "InsertPeg": "Insert the peg into the target hole",
+    "PatternLock": "Unlock the pattern by pressing buttons in sequence",
+    "RouteStick": "Route the stick through the required waypoints",
+}
+
+
+def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]:
+    """Map each requested RoboMME task to a description; unknown names map to themselves.
+    Keys are `<task>_<task_id>` (the eval script's video filename pattern); ids default to [0]."""
+    if task_ids is None:
+        task_ids = [0]
+    out: dict[str, str] = {}
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc = _ROBOMME_DESCRIPTIONS.get(name, name)
+        for tid in task_ids:
+            out[f"{name}_{tid}"] = desc
+    return out
+
+
+def _vlabench_descriptions(task_spec: str) -> dict[str, str]:
+    """Map `<task>_0` to the task name with underscores as spaces, per comma-separated task.
+
+    VLABench tasks carry language instructions on their dm_control task
+    object, but pulling them requires loading the full env per task
+    (~seconds each). The CI smoke-eval already captures the instruction
+    inside its episode info; this mapping is just enough to key
+    `metrics.json` by `<task>_0`.
+    """
+    out: dict[str, str] = {}
+    for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+        out[f"{task}_0"] = task.replace("_", " ").strip()
+    return out


 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
    parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
+    parser.add_argument(
+        "--task-ids",
+        type=str,
+        default=None,
+        help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]",
+    )
    parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
    args = parser.parse_args()

+    task_ids: list[int] | None = None
+    if args.task_ids:
+        task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
+
    descriptions: dict[str, str] = {}
    try:
-        if args.env in {"libero", "libero_plus"}:
+        if args.env in {"libero", "libero_plus"}:  # both families share the LIBERO extractor
            descriptions = _libero_descriptions(args.task)
        elif args.env == "metaworld":
            descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robotwin":
+            descriptions = _robotwin_descriptions(args.task)
+        elif args.env == "robocasa":
+            descriptions = _robocasa_descriptions(args.task)
        elif args.env == "robomme":
-            descriptions = _robomme_descriptions(args.task)
+            descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
+        elif args.env == "vlabench":
+            descriptions = _vlabench_descriptions(args.task)
        else:
            print(
                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
@@ -1,27 +0,0 @@
---
-title: LeRobot Benchmark Leaderboard
-emoji: 🤖
-colorFrom: yellow
-colorTo: orange
-sdk: gradio
-sdk_version: 5.29.0
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Benchmark history for LeRobot policy x benchmark runs
---
-
-# LeRobot Benchmark Leaderboard
-
-This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
-
- Latest result per policy and benchmark
- Historical trends over time
- Direct links to uploaded eval and config artifacts
-
-## Configuration
-
-Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI
-at a different public dataset. The default is:
-
- `lerobot/benchmark-history`
@@ -1,226 +0,0 @@
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import json
-import os
-import time
-from pathlib import Path
-from typing import Any
-
-import gradio as gr
-import pandas as pd
-import plotly.express as px
-from huggingface_hub import HfApi, hf_hub_download
-
-RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
-CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-CACHE_TTL_S = 300
-
-_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
-
-
-def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
-    overall = row.get("eval", {}).get("overall", {})
-    resources = row.get("resources", {})
-    timings = row.get("timings", {})
-    artifact_urls = row.get("artifact_urls", {})
-    return {
-        "created_at": row.get("created_at"),
-        "benchmark": row.get("benchmark"),
-        "policy": row.get("policy"),
-        "success_rate": overall.get("pc_success"),
-        "n_episodes": overall.get("n_episodes"),
-        "avg_sum_reward": overall.get("avg_sum_reward"),
-        "train_wall_time_s": timings.get("train_wall_time_s"),
-        "eval_wall_time_s": timings.get("eval_wall_time_s"),
-        "total_wall_time_s": timings.get("total_wall_time_s"),
-        "num_gpus": resources.get("num_gpus"),
-        "microbatch_per_gpu": resources.get("microbatch_per_gpu"),
-        "gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
-        "effective_batch_size": resources.get("effective_batch_size"),
-        "git_commit": row.get("git_commit"),
-        "row_url": artifact_urls.get("row"),
-        "eval_info_url": artifact_urls.get("eval_info"),
-        "train_config_url": artifact_urls.get("train_config"),
-    }
-
-
-def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
-    cache_key = f"rows::{repo_id}"
-    cached = _CACHE.get(cache_key)
-    if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
-        return cached[1]
-
-    api = HfApi()
-    files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
-    records: list[dict[str, Any]] = []
-    for path_in_repo in sorted(files, reverse=True):
-        local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
-        with open(local_path) as f:
-            row = json.load(f)
-        records.append(_row_to_record(row))
-
-    df = pd.DataFrame.from_records(records)
-    if not df.empty:
-        df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
-        df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
-    _CACHE[cache_key] = (time.monotonic(), df)
-    return df
-
-
-def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
-    if df.empty:
-        return df
-    latest = (
-        df.sort_values("created_at", ascending=False)
-        .groupby(["benchmark", "policy"], as_index=False)
-        .first()
-        .sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
-    )
-    return latest[
-        [
-            "benchmark",
-            "policy",
-            "success_rate",
-            "n_episodes",
-            "train_wall_time_s",
-            "eval_wall_time_s",
-            "num_gpus",
-            "effective_batch_size",
-            "git_commit",
-            "row_url",
-            "eval_info_url",
-            "train_config_url",
-        ]
-    ]
-
-
-def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
-    filtered = df[df["benchmark"] == benchmark]
-    if policy and policy != "All":
-        filtered = filtered[filtered["policy"] == policy]
-    if filtered.empty:
-        return px.line(title="No benchmark rows found")
-    fig = px.line(
-        filtered.sort_values("created_at"),
-        x="created_at",
-        y="success_rate",
-        color="policy",
-        markers=True,
-        hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
-        title=f"{benchmark} success rate history",
-    )
-    fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
-    return fig
-
-
-def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
-    filtered = df[df["benchmark"] == benchmark]
-    if policy and policy != "All":
-        filtered = filtered[filtered["policy"] == policy]
-    if filtered.empty:
-        return "No matching runs yet."
-    latest = filtered.sort_values("created_at", ascending=False).iloc[0]
-    row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
-    eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
-    train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
-    lines = [
-        f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
-        f"Success rate: `{latest['success_rate']}`",
-        f"GPUs: `{latest['num_gpus']}`",
-        f"Effective batch size: `{latest['effective_batch_size']}`",
-        f"Commit: `{latest['git_commit']}`",
-    ]
-    if row_link:
-        lines.append(f"Row JSON: [open]({row_link})")
-    if eval_link:
-        lines.append(f"Eval Info: [open]({eval_link})")
-    if train_link:
-        lines.append(f"Train Config: [open]({train_link})")
-    return "\n\n".join(lines)
-
-
-def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
-    df = load_rows()
-    latest_table = make_latest_table(df)
-    benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
-    if benchmark not in benchmark_names and benchmark_names:
-        benchmark = benchmark_names[0]
-    policy_choices = ["All"]
-    if benchmark and not df.empty:
-        policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
-    if policy not in policy_choices:
-        policy = "All"
-    history = make_history_figure(df, benchmark, policy)
-    summary = make_run_markdown(df, benchmark, policy)
-    return latest_table, gr.update(choices=policy_choices, value=policy), history, summary
-
-
-with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
-    gr.Markdown(
-        f"""
-# LeRobot Benchmark Leaderboard
-
-Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
-"""
-    )
-
-    with gr.Row():
-        benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
-        policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
-        refresh_button = gr.Button("Refresh")
-
-    latest_table = gr.Dataframe(label="Latest Results", interactive=False)
-    history_plot = gr.Plot(label="History")
-    latest_summary = gr.Markdown()
-
-    def _initial_state():
-        df = load_rows()
-        benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
-        benchmark = benchmarks[0] if benchmarks else ""
-        latest, policy_choices, history, summary = refresh_view(benchmark, "All")
-        return (
-            gr.update(choices=benchmarks, value=benchmark),
-            policy_choices,
-            latest,
-            history,
-            summary,
-        )
-
-    demo.load(
-        _initial_state,
-        outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
-    )
-    refresh_button.click(
-        refresh_view,
-        inputs=[benchmark_dropdown, policy_dropdown],
-        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
-    )
-    benchmark_dropdown.change(
-        refresh_view,
-        inputs=[benchmark_dropdown, policy_dropdown],
-        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
-    )
-    policy_dropdown.change(
-        refresh_view,
-        inputs=[benchmark_dropdown, policy_dropdown],
-        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
-    )
-
-
-if __name__ == "__main__":
-    demo.launch()
@@ -1,4 +0,0 @@
-gradio>=5.0.0,<6.0.0
-plotly>=5.18.0
-pandas>=2.0.0
-huggingface-hub>=1.0.0,<2.0.0
@@ -33,7 +33,7 @@ import cv2  # type: ignore  # TODO: add type stubs for OpenCV
 import numpy as np  # type: ignore  # TODO: add type stubs for numpy

 from lerobot.utils.decorators import check_if_not_connected
-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 if TYPE_CHECKING or _reachy2_sdk_available:
    from reachy2_sdk.media.camera import CameraView
@@ -76,6 +76,7 @@ class Reachy2Camera(Camera):
        Args:
            config: The configuration settings for the camera.
        """
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -17,18 +17,21 @@ Provides the RealSenseCamera class for capturing frames from Intel RealSense cam
 """

 import logging
+import sys
 import time
 from threading import Event, Lock, Thread
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import cv2  # type: ignore  # TODO: add type stubs for OpenCV
 import numpy as np  # type: ignore  # TODO: add type stubs for numpy
 from numpy.typing import NDArray  # type: ignore  # TODO: add type stubs for numpy.typing

-try:
-    import pyrealsense2 as rs  # type: ignore  # TODO: add type stubs for pyrealsense2
-except Exception as e:
-    logging.info(f"Could not import realsense: {e}")
+from lerobot.utils.import_utils import _pyrealsense2_available, require_package
+
+if TYPE_CHECKING or _pyrealsense2_available:
+    import pyrealsense2 as rs
+else:
+    rs = None

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.errors import DeviceNotConnectedError
@@ -39,6 +42,7 @@ from ..utils import get_cv2_rotation
 from .configuration_realsense import RealSenseCameraConfig

 logger = logging.getLogger(__name__)
+pkg_name = "pyrealsense2-macosx" if sys.platform == "darwin" else "pyrealsense2"


 class RealSenseCamera(Camera):
@@ -112,7 +116,7 @@ class RealSenseCamera(Camera):
        Args:
            config: The configuration settings for the camera.
        """
-
+        require_package(pkg_name, extra="intelrealsense", import_name="pyrealsense2")
        super().__init__(config)

        self.config = config
@@ -28,12 +28,19 @@ import json
 import logging
 import time
 from threading import Event, Lock, Thread
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import cv2
 import numpy as np
 from numpy.typing import NDArray

+from lerobot.utils.import_utils import _zmq_available, require_package
+
+if TYPE_CHECKING or _zmq_available:
+    import zmq
+else:
+    zmq = None
+
 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.errors import DeviceNotConnectedError

@@ -74,8 +81,8 @@ class ZMQCamera(Camera):
    """

    def __init__(self, config: ZMQCameraConfig):
+        require_package("pyzmq", extra="pyzmq-dep", import_name="zmq")
        super().__init__(config)
-        import zmq

        self.config = config
        self.server_address = config.server_address
@@ -117,8 +124,6 @@ class ZMQCamera(Camera):
        logger.info(f"Connecting to {self}...")

        try:
-            import zmq
-
            self.context = zmq.Context()
            self.socket = self.context.socket(zmq.SUB)
            self.socket.setsockopt_string(zmq.SUBSCRIBE, "")
@@ -180,11 +185,8 @@ class ZMQCamera(Camera):

        try:
            message = self.socket.recv_string()
-        except Exception as e:
-            # zmq is lazy-imported in connect(), so check by name to avoid a top-level import
-            if type(e).__name__ == "Again":
-                raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e
-            raise
+        except zmq.Again as e:
+            raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e

        # Decode JSON message
        data = json.loads(message)
@@ -28,6 +28,12 @@ import numpy as np
 import torch

 from lerobot.policies import PreTrainedPolicy, prepare_observation_for_inference
+from lerobot.utils.import_utils import _deepdiff_available, require_package
+
+if TYPE_CHECKING or _deepdiff_available:
+    from deepdiff import DeepDiff
+else:
+    DeepDiff = None

 if TYPE_CHECKING:
    from lerobot.datasets import LeRobotDataset
@@ -217,10 +223,7 @@ def sanity_check_dataset_robot_compatibility(
    Raises:
        ValueError: If any of the checked metadata fields do not match.
    """
-    from lerobot.utils.import_utils import require_package
-
-    require_package("deepdiff", extra="hardware")
-    from deepdiff import DeepDiff
+    require_package("deepdiff", extra="deepdiff-dep")

    from lerobot.utils.constants import DEFAULT_FEATURES

@@ -35,6 +35,9 @@ class DatasetConfig:
    revision: str | None = None
    use_imagenet_stats: bool = True
    video_backend: str = field(default_factory=get_safe_default_codec)
+    # When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0).
+    # This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion.
+    return_uint8: bool = False
    streaming: bool = False

    def __post_init__(self) -> None:
@@ -67,17 +70,11 @@ class EvalConfig:
    # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
    # Set to 0 for auto-tuning based on available CPU cores and n_episodes.
    batch_size: int = 0
-    # Number of rollout videos to save per evaluated task. Set to 0 to disable videos.
-    max_episodes_rendered: int = 10
    # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
    # Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1.
    use_async_envs: bool = True

    def __post_init__(self) -> None:
-        if self.max_episodes_rendered < 0:
-            raise ValueError(
-                f"`max_episodes_rendered` must be non-negative, got {self.max_episodes_rendered}."
-            )
        if self.batch_size == 0:
            self.batch_size = self._auto_batch_size()
        if self.batch_size > self.n_episodes:
@@ -56,7 +56,8 @@ class TrainPipelineConfig(HubMixin):
    # Number of workers for the dataloader.
    num_workers: int = 4
    batch_size: int = 8
-    gradient_accumulation_steps: int = 1
+    prefetch_factor: int = 4
+    persistent_workers: bool = True
    steps: int = 100_000
    eval_freq: int = 20_000
    log_freq: int = 200
@@ -133,11 +134,6 @@ class TrainPipelineConfig(HubMixin):
        if isinstance(self.dataset.repo_id, list):
            raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")

-        if self.gradient_accumulation_steps <= 0:
-            raise ValueError(
-                f"`gradient_accumulation_steps` must be strictly positive, got {self.gradient_accumulation_steps}."
-            )
-
        if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
            raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
        elif self.use_policy_training_preset and not self.resume:
@@ -16,6 +16,7 @@
 """Private reader component for LeRobotDataset. Handles random-access reading (HF dataset, delta indices, video decoding)."""

 from collections.abc import Callable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

 import datasets
@@ -49,6 +50,7 @@ class DatasetReader:
        video_backend: str,
        delta_timestamps: dict[str, list[float]] | None,
        image_transforms: Callable | None,
+        return_uint8: bool = False,
    ):
        """Initialize the reader with metadata, filtering, and transform config.

@@ -73,6 +75,7 @@ class DatasetReader:
        self._tolerance_s = tolerance_s
        self._video_backend = video_backend
        self._image_transforms = image_transforms
+        self._return_uint8 = return_uint8

        self.hf_dataset: datasets.Dataset | None = None
        self._absolute_to_relative_idx: dict[int, int] | None = None
@@ -105,10 +108,8 @@ class DatasetReader:
        """Build absolute-to-relative index mapping from loaded hf_dataset."""
        self._absolute_to_relative_idx = None
        if self.episodes is not None and self.hf_dataset is not None:
-            self._absolute_to_relative_idx = {
-                abs_idx.item() if isinstance(abs_idx, torch.Tensor) else abs_idx: rel_idx
-                for rel_idx, abs_idx in enumerate(self.hf_dataset["index"])
-            }
+            indices = self.hf_dataset.data.column("index").to_numpy()
+            self._absolute_to_relative_idx = dict(zip(indices.tolist(), range(len(indices)), strict=True))

    @property
    def num_frames(self) -> int:
@@ -235,16 +236,30 @@ class DatasetReader:
        Segmentation Fault.
        """
        ep = self._meta.episodes[ep_idx]
-        item = {}
-        for vid_key, query_ts in query_timestamps.items():
+
+        def _decode_single(vid_key: str, query_ts: list[float]) -> tuple[str, torch.Tensor]:
            from_timestamp = ep[f"videos/{vid_key}/from_timestamp"]
            shifted_query_ts = [from_timestamp + ts for ts in query_ts]
-
            video_path = self.root / self._meta.get_video_file_path(ep_idx, vid_key)
-            frames = decode_video_frames(video_path, shifted_query_ts, self._tolerance_s, self._video_backend)
-            item[vid_key] = frames.squeeze(0)
+            frames = decode_video_frames(
+                video_path,
+                shifted_query_ts,
+                self._tolerance_s,
+                self._video_backend,
+                return_uint8=self._return_uint8,
+            )
+            return vid_key, frames.squeeze(0)

-        return item
+        items = list(query_timestamps.items())
+
+        # Single camera: no threading overhead
+        if len(items) <= 1:
+            return {vid_key: _decode_single(vid_key, query_ts)[1] for vid_key, query_ts in items}
+
+        # Multi-camera: decode in parallel (video decoding releases the GIL)
+        with ThreadPoolExecutor(max_workers=len(items)) as pool:
+            futures = [pool.submit(_decode_single, k, ts) for k, ts in items]
+            return dict(f.result() for f in futures)

    def get_item(self, idx) -> dict:
        """Core __getitem__ logic. Assumes hf_dataset is loaded.
@@ -597,7 +597,7 @@ class DatasetWriter:

    def cleanup_interrupted_episode(self, episode_index: int) -> None:
        """Remove temporary image directories for an interrupted episode."""
-        for key in self._meta.video_keys:
+        for key in self._meta.camera_keys:
            img_dir = self._get_image_file_path(
                episode_index=episode_index, image_key=key, frame_index=0
            ).parent
@@ -92,6 +92,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
                image_transforms=image_transforms,
                revision=cfg.dataset.revision,
                video_backend=cfg.dataset.video_backend,
+                return_uint8=True,
                tolerance_s=cfg.tolerance_s,
            )
        else:
@@ -104,6 +105,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
                revision=cfg.dataset.revision,
                max_num_shards=cfg.num_workers,
                tolerance_s=cfg.tolerance_s,
+                return_uint8=True,
            )
    else:
        raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
@@ -30,13 +30,13 @@ def safe_stop_image_writer(func):
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
-        except Exception as e:
+        except BaseException:  # also covers KeyboardInterrupt/SystemExit, so the writer is stopped on Ctrl-C too
            dataset = kwargs.get("dataset")
            writer = getattr(dataset, "writer", None) if dataset else None
            if writer is not None and writer.image_writer is not None:
                logger.warning("Waiting for image writer to terminate...")
                writer.image_writer.stop()
-            raise e
+            raise  # bare raise re-raises the active exception unchanged

    return wrapper

@@ -56,6 +56,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        force_cache_sync: bool = False,
        download_videos: bool = True,
        video_backend: str | None = None,
+        return_uint8: bool = False,
        batch_encoding_size: int = 1,
        vcodec: str = "libsvtav1",
        streaming_encoding: bool = False,
@@ -202,6 +203,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.tolerance_s = tolerance_s
        self.revision = revision if revision else CODEBASE_VERSION
        self._video_backend = video_backend if video_backend else get_safe_default_codec()
+        self._return_uint8 = return_uint8
        self._batch_encoding_size = batch_encoding_size
        self._vcodec = resolve_vcodec(vcodec)
        self._encoder_threads = encoder_threads
@@ -225,6 +227,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
            video_backend=self._video_backend,
            delta_timestamps=delta_timestamps,
            image_transforms=image_transforms,
+            return_uint8=self._return_uint8,
        )

        # Load actual data
@@ -288,6 +291,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                video_backend=self._video_backend,
                delta_timestamps=self.delta_timestamps,
                image_transforms=self.image_transforms,
+                return_uint8=self._return_uint8,
            )
        return self.reader

@@ -683,6 +687,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.delta_timestamps = None
        obj.episodes = None
        obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec()
+        obj._return_uint8 = False
        obj._batch_encoding_size = batch_encoding_size
        obj._vcodec = vcodec
        obj._encoder_threads = encoder_threads
@@ -775,6 +780,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.delta_timestamps = None
        obj.episodes = None
        obj._video_backend = video_backend if video_backend else get_safe_default_codec()
+        obj._return_uint8 = False
        obj._batch_encoding_size = batch_encoding_size
        obj._vcodec = vcodec
        obj._encoder_threads = encoder_threads
@@ -251,6 +251,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
        seed: int = 42,
        rng: np.random.Generator | None = None,
        shuffle: bool = True,
+        return_uint8: bool = False,
    ):
        """Initialize a StreamingLeRobotDataset.

@@ -288,6 +289,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):

        self.streaming = streaming
        self.buffer_size = buffer_size
+        self._return_uint8 = return_uint8

        # We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown)
        self.video_decoder_cache = None
@@ -553,7 +555,11 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
            root = self.meta.url_root if self.streaming and not self.streaming_from_local else self.root
            video_path = f"{root}/{self.meta.get_video_file_path(ep_idx, video_key)}"
            frames = decode_video_frames_torchcodec(
-                video_path, query_ts, self.tolerance_s, decoder_cache=self.video_decoder_cache
+                video_path,
+                query_ts,
+                self.tolerance_s,
+                decoder_cache=self.video_decoder_cache,
+                return_uint8=self._return_uint8,
            )

            item[video_key] = frames.squeeze(0) if len(query_ts) == 1 else frames
@@ -123,6 +123,7 @@ def decode_video_frames(
    timestamps: list[float],
    tolerance_s: float,
    backend: str | None = None,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """
    Decodes video frames using the specified backend.
@@ -131,19 +132,23 @@ def decode_video_frames(
        video_path (Path): Path to the video file.
        timestamps (list[float]): List of timestamps to extract frames.
        tolerance_s (float): Allowed deviation in seconds for frame retrieval.
-        backend (str, optional): Backend to use for decoding. Defaults to "torchcodec" when available in the platform; otherwise, defaults to "pyav"..
+        backend (str, optional): Backend to use for decoding. Defaults to "torchcodec" when available in the platform; otherwise, defaults to "pyav".
+        return_uint8 (bool): If True, return raw uint8 frames without float32 normalization.
+            This reduces memory for DataLoader IPC; normalization can be done on GPU afterward.

    Returns:
-        torch.Tensor: Decoded frames.
+        torch.Tensor: Decoded frames (float32 in [0,1] by default, or uint8 if return_uint8=True).

    Currently supports torchcodec on cpu and pyav.
    """
    if backend is None:
        backend = get_safe_default_codec()
    if backend == "torchcodec":
-        return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s)
+        return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8)
    elif backend in ["pyav", "video_reader"]:
-        return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
+        return decode_video_frames_torchvision(
+            video_path, timestamps, tolerance_s, backend, return_uint8=return_uint8
+        )
    else:
        raise ValueError(f"Unsupported video backend: {backend}")

@@ -154,6 +159,7 @@ def decode_video_frames_torchvision(
    tolerance_s: float,
    backend: str = "pyav",
    log_loaded_timestamps: bool = False,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """Loads frames associated to the requested timestamps of a video

@@ -240,14 +246,17 @@ def decode_video_frames_torchvision(
    if log_loaded_timestamps:
        logger.info(f"{closest_ts=}")

-    # convert to the pytorch format which is float32 in [0,1] range (and channel first)
-    closest_frames = closest_frames.type(torch.float32) / 255
-
    if len(timestamps) != len(closest_frames):
        raise FrameTimestampError(
            f"Number of retrieved frames ({len(closest_frames)}) does not match "
            f"number of queried timestamps ({len(timestamps)})"
        )
+
+    if return_uint8:
+        return closest_frames
+
+    # convert to the pytorch format which is float32 in [0,1] range (and channel first)
+    closest_frames = closest_frames.type(torch.float32) / 255
    return closest_frames


@@ -306,6 +315,7 @@ def decode_video_frames_torchcodec(
    tolerance_s: float,
    log_loaded_timestamps: bool = False,
    decoder_cache: VideoDecoderCache | None = None,
+    return_uint8: bool = False,
 ) -> torch.Tensor:
    """Loads frames associated with the requested timestamps of a video using torchcodec.

@@ -373,14 +383,16 @@ def decode_video_frames_torchcodec(
    if log_loaded_timestamps:
        logger.info(f"{closest_ts=}")

-    # convert to float32 in [0,1] range
-    closest_frames = (closest_frames / 255.0).type(torch.float32)
-
    if not len(timestamps) == len(closest_frames):
        raise FrameTimestampError(
            f"Retrieved timestamps differ from queried {set(closest_frames) - set(timestamps)}"
        )

+    if return_uint8:
+        return closest_frames
+
+    # convert to float32 in [0,1] range
+    closest_frames = (closest_frames / 255.0).type(torch.float32)
    return closest_frames


@@ -18,15 +18,7 @@
 # from lerobot.utils.import_utils import require_package
 # require_package("gymnasium", extra="<update_extra>", import_name="gymnasium")

-from .configs import (
-    AlohaEnv,
-    EnvConfig,
-    HILSerlRobotEnvConfig,
-    HubEnvConfig,
-    LiberoPlusEnv,
-    PushtEnv,
-    RoboMMEEnv,
-)
+from .configs import AlohaEnv, EnvConfig, HILSerlRobotEnvConfig, HubEnvConfig, PushtEnv
 from .factory import make_env, make_env_config, make_env_pre_post_processors
 from .utils import check_env_attributes_and_types, close_envs, env_to_policy_features, preprocess_observation

@@ -35,9 +27,7 @@ __all__ = [
    "EnvConfig",
    "HILSerlRobotEnvConfig",
    "HubEnvConfig",
-    "LiberoPlusEnv",
    "PushtEnv",
-    "RoboMMEEnv",
    "check_env_attributes_and_types",
    "close_envs",
    "env_to_policy_features",
@@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig):
    camera_name_mapping: dict[str, str] | None = None
    observation_height: int = 360
    observation_width: int = 360
+    is_libero_plus: bool = False
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {
            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
@@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig):
            control_mode=self.control_mode,
            episode_length=self.episode_length,
            camera_name_mapping=self.camera_name_mapping,
+            is_libero_plus=self.is_libero_plus,
        )

    def get_env_processors(self):
@@ -496,6 +498,146 @@ class MetaworldEnv(EnvConfig):
        )


+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+    """Configuration for RoboCasa benchmark environments.
+
+    ``camera_name`` is a comma-separated list of RoboCasa camera names. Each
+    listed camera is exposed as a ``pixels/<cam>`` visual feature of shape
+    ``(observation_height, observation_width, 3)``. ``agent_pos`` is added only
+    when ``obs_type == "pixels_agent_pos"``.
+    """
+
+    task: str = "CloseFridge"
+    fps: int = 20
+    episode_length: int = 1000
+    obs_type: str = "pixels_agent_pos"  # "pixels" or "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+    observation_height: int = 256
+    observation_width: int = 256
+    visualization_height: int = 512
+    visualization_width: int = 512
+    # Only forwarded to the env (as ``split``) when not None.
+    split: str | None = None
+    # Object-mesh registries to sample from. Upstream default is
+    # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
+    # only ships the lightwheel pack. Override to include objaverse once
+    # you've run `python -m robocasa.scripts.download_kitchen_assets
+    # --type objaverse` locally.
+    obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
+    )
+    features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
+
+    def __post_init__(self):
+        """Validate ``obs_type`` and register the per-camera and state features.
+
+        Raises:
+            ValueError: If ``obs_type`` is neither ``"pixels"`` nor ``"pixels_agent_pos"``.
+        """
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # Preserve raw RoboCasa camera names end-to-end (e.g.
+        # `observation.images.robot0_agentview_left`). This matches the
+        # naming convention used by the RoboCasa datasets on the Hub, so
+        # trained policies don't need a `--rename_map` at eval time.
+        cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
+        for cam in cams:
+            key = f"pixels/{cam}"
+            self.features[key] = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(self.observation_height, self.observation_width, 3),
+            )
+            self.features_map[key] = f"{OBS_IMAGES}.{cam}"
+
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """Keyword arguments forwarded to the environment constructor.
+
+        ``split`` is included only when it has been set.
+        """
+        kwargs: dict[str, Any] = {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "observation_height": self.observation_height,
+            "observation_width": self.observation_width,
+            "visualization_height": self.visualization_height,
+            "visualization_width": self.visualization_width,
+        }
+        if self.split is not None:
+            kwargs["split"] = self.split
+        return kwargs
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = False):
+        """Build the RoboCasa environments through ``create_robocasa_envs``.
+
+        Args:
+            n_envs: Number of environments to create.
+            use_async_envs: Forwarded to ``_make_vec_env_cls`` to pick the vector-env class.
+
+        Returns:
+            Whatever ``create_robocasa_envs`` returns for this configuration.
+
+        Raises:
+            ValueError: If ``task`` is unset or empty.
+        """
+        # Imported at call time rather than at module import.
+        from .robocasa import create_robocasa_envs
+
+        # Reject an empty string as well as None, like RoboTwinEnvConfig does.
+        if not self.task:
+            raise ValueError("RoboCasaEnv requires a task to be specified")
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_robocasa_envs(
+            task=self.task,
+            n_envs=n_envs,
+            camera_name=self.camera_name,
+            gym_kwargs=self.gym_kwargs,
+            env_cls=env_cls,
+            episode_length=self.episode_length,
+            obj_registries=tuple(self.obj_registries),
+        )
+
+
+@EnvConfig.register_subclass("vlabench")
+@dataclass
+class VLABenchEnv(EnvConfig):
+    """Configuration for VLABench environments.
+
+    Three camera streams (``image``, ``second_image``, ``wrist_image``) are
+    always registered as visual features at ``render_resolution``. ``agent_pos``
+    is added only when ``obs_type == "pixels_agent_pos"``.
+    """
+
+    task: str = "select_fruit"
+    fps: int = 10
+    episode_length: int = 500  # forwarded to the env as ``max_episode_steps``
+    obs_type: str = "pixels_agent_pos"  # "pixels" or "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    render_resolution: tuple[int, int] = (480, 480)  # unpacked as (height, width)
+    robot: str = "franka"
+    action_mode: str = "eef"
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "agent_pos": OBS_STATE,
+            "pixels/image": f"{OBS_IMAGES}.image",
+            "pixels/second_image": f"{OBS_IMAGES}.second_image",
+            "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
+        }
+    )
+
+    def __post_init__(self):
+        """Validate ``obs_type`` and register the camera and state features.
+
+        Raises:
+            ValueError: If ``obs_type`` is neither ``"pixels"`` nor ``"pixels_agent_pos"``.
+        """
+        h, w = self.render_resolution
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # Both supported obs types expose the same three cameras.
+        for cam in ("image", "second_image", "wrist_image"):
+            self.features[f"pixels/{cam}"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """Keyword arguments forwarded to the environment constructor."""
+        return {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "render_resolution": self.render_resolution,
+            "robot": self.robot,
+            "max_episode_steps": self.episode_length,
+            "action_mode": self.action_mode,
+        }
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = False):
+        """Build the VLABench environments through ``create_vlabench_envs``.
+
+        Args:
+            n_envs: Number of environments to create.
+            use_async_envs: Forwarded to ``_make_vec_env_cls`` to pick the vector-env class.
+
+        Returns:
+            Whatever ``create_vlabench_envs`` returns for this configuration.
+
+        Raises:
+            ValueError: If ``task`` is unset or empty.
+        """
+        # Imported at call time rather than at module import.
+        from .vlabench import create_vlabench_envs
+
+        # Reject an empty string as well as None, like RoboTwinEnvConfig does.
+        if not self.task:
+            raise ValueError("VLABenchEnv requires a task to be specified")
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_vlabench_envs(
+            task=self.task,
+            n_envs=n_envs,
+            gym_kwargs=self.gym_kwargs,
+            env_cls=env_cls,
+        )
+
+
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
 class IsaaclabArenaEnv(HubEnvConfig):
@@ -579,45 +721,158 @@ class IsaaclabArenaEnv(HubEnvConfig):
@EnvConfig.register_subclass("libero_plus")
@dataclass
 class LiberoPlusEnv(LiberoEnv):
-    """Config for LIBERO-plus robustness benchmark evaluation."""
+    """Config for LIBERO-plus robustness benchmark evaluation.
+
+    LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints,
+    object layouts, robot initial states, language instructions, lighting, background
+    textures, sensor noise) producing ~10k task variants.
+
+    The gym interface is identical to LIBERO so this class reuses ``LiberoEnv``;
+    only the registered name, default task suite and ``is_libero_plus`` flag differ.
+
+    Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships
+    as a namespace package from a git fork and must be cloned + PYTHONPATH'd
+    rather than installed as a pyproject extra.
+
+    See Also:
+        https://github.com/sylvestf/LIBERO-plus
+    """

    task: str = "libero_spatial"
+    is_libero_plus: bool = True


-@EnvConfig.register_subclass("robomme")
+@EnvConfig.register_subclass("robotwin")
@dataclass
-class RoboMMEEnv(EnvConfig):
-    """RoboMME memory-augmented manipulation benchmark."""
+class RoboTwinEnvConfig(EnvConfig):
+    """Configuration for RoboTwin 2.0 benchmark environments.

-    task: str = "PickXtimes"
-    fps: int = 10
+    RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the
+    SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
+    (7 per arm). All three cameras are enabled by default.
+
+    See: https://robotwin-platform.github.io
+    Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
+    """
+
+    task: str = "beat_block_hammer"  # single task or comma-separated list
+    fps: int = 25
    episode_length: int = 300
-    action_space: str = "joint_angle"
-    dataset_split: str = "test"
-    task_ids: list[int] | None = None
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    # Available cameras from RoboTwin's aloha-agilex embodiment: head_camera
+    # (torso-mounted) + left_camera / right_camera (wrists).
+    camera_names: str = "head_camera,left_camera,right_camera"
+    # Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
+    # Gym's vector-env concatenate pre-allocates buffers of this shape, so it
+    # must equal what SAPIEN actually renders.
+    observation_height: int = 240
+    observation_width: int = 320
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {
-            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)),
-            "image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
-            "wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
-            OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)),
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
        }
    )
    features_map: dict[str, str] = field(
        default_factory=lambda: {
            ACTION: ACTION,
-            "image": f"{OBS_IMAGES}.image",
-            "wrist_image": f"{OBS_IMAGES}.wrist_image",
-            OBS_STATE: OBS_STATE,
+            "pixels/head_camera": f"{OBS_IMAGES}.head_camera",
+            "pixels/left_camera": f"{OBS_IMAGES}.left_camera",
+            "pixels/right_camera": f"{OBS_IMAGES}.right_camera",
+            "agent_pos": OBS_STATE,
        }
    )

+    def __post_init__(self):
+        cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
+        for cam in cam_list:
+            self.features[f"pixels/{cam}"] = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(self.observation_height, self.observation_width, 3),
+            )
+            # Keep features_map entry if already set (default_factory); add if missing.
+            key = f"pixels/{cam}"
+            if key not in self.features_map:
+                self.features_map[key] = f"{OBS_IMAGES}.{cam}"
+
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(14,),  # 14 DOF: 7 per arm
+            )
+        elif self.obs_type != "pixels":
+            raise ValueError(
+                f"Unsupported obs_type '{self.obs_type}'. "
+                "RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
+            )
+
    @property
    def gym_kwargs(self) -> dict:
        return {}

    def create_envs(self, n_envs: int, use_async_envs: bool = True):
-        from .robomme import create_robomme_envs
+        from lerobot.envs.robotwin import create_robotwin_envs
+
+        if not self.task:
+            raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
+
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
+        return create_robotwin_envs(
+            task=self.task,
+            n_envs=n_envs,
+            env_cls=env_cls,
+            camera_names=cam_list,
+            observation_height=self.observation_height,
+            observation_width=self.observation_width,
+            episode_length=self.episode_length,
+        )
+
+
+@EnvConfig.register_subclass("robomme")
+@dataclass
+class RoboMMEEnv(EnvConfig):
+    """RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).
+
+    16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
+    Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes).
+    Benchmark: https://github.com/RoboMME/robomme_benchmark
+
+    Requires the `robomme` git package installed separately (Linux only);
+    see docker/Dockerfile.benchmark.robomme for the canonical install.
+    """
+
+    task: str = "PickXtimes"
+    fps: int = 10
+    episode_length: int = 300
+    action_space: str = "joint_angle"  # or "ee_pose" (7-D)
+    dataset_split: str = "test"  # "train" | "val" | "test"
+    task_ids: list[int] | None = None
+    features: dict[str, PolicyFeature] = field(default_factory=dict)
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "pixels/image": f"{OBS_IMAGES}.image",
+            "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
+            "agent_pos": OBS_STATE,
+        }
+    )
+
+    def __post_init__(self):
+        action_dim = 8 if self.action_space == "joint_angle" else 7
+        self.features = {
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)),
+            "pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
+            "pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
+            "agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)),
+        }
+
+    @property
+    def gym_kwargs(self) -> dict:
+        return {}
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = True):
+        from lerobot.envs.robomme import create_robomme_envs

        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
        return create_robomme_envs(
@@ -32,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv

 from lerobot.types import RobotObservation

-from .utils import _LazyAsyncVectorEnv
-
-
-def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
-    """Normalize camera_name into a non-empty list of strings."""
-    if isinstance(camera_name, str):
-        cams = [c.strip() for c in camera_name.split(",") if c.strip()]
-    elif isinstance(camera_name, (list | tuple)):
-        cams = [str(c).strip() for c in camera_name if str(c).strip()]
-    else:
-        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
-    if not cams:
-        raise ValueError("camera_name resolved to an empty list.")
-    return cams
+from .utils import _LazyAsyncVectorEnv, parse_camera_names


 def _get_suite(name: str) -> benchmark.Benchmark:
@@ -77,18 +64,24 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
 _LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")


-def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
+def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray:
    task = task_suite.tasks[i]
    filename = Path(task.init_states_file)
    root = Path(get_libero_path("init_states"))

-    # `_add_` / `_level` variants store extra-object layouts under libero_newobj/
-    # as a flat array that must be reshaped to (1, -1).
+    if not is_libero_plus:
+        init_states_path = root / task.problem_folder / filename.name
+        return torch.load(init_states_path, weights_only=False)  # nosec B614
+
+    # LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under
+    # libero_newobj/ as a flat array that must be reshaped to (1, -1).
    if "_add_" in filename.name or "_level" in filename.name:
        init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
        init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
        return init_states.reshape(1, -1)

+    # LIBERO-plus perturbation variants encode the perturbation in the filename
+    # but on disk only the base `.pruned_init` exists — strip the suffix to match.
    stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
    init_states_path = root / task.problem_folder / stripped
    return torch.load(init_states_path, weights_only=False)  # nosec B614
@@ -133,9 +126,11 @@ class LiberoEnv(gym.Env):
        camera_name_mapping: dict[str, str] | None = None,
        num_steps_wait: int = 10,
        control_mode: str = "relative",
+        is_libero_plus: bool = False,
    ):
        super().__init__()
        self.task_id = task_id
+        self.is_libero_plus = is_libero_plus
        self.obs_type = obs_type
        self.render_mode = render_mode
        self.observation_width = observation_width
@@ -143,7 +138,7 @@ class LiberoEnv(gym.Env):
        self.visualization_width = visualization_width
        self.visualization_height = visualization_height
        self.init_states = init_states
-        self.camera_name = _parse_camera_names(
+        self.camera_name = parse_camera_names(
            camera_name
        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)

@@ -162,7 +157,11 @@ class LiberoEnv(gym.Env):
        self.episode_index = episode_index
        self.episode_length = episode_length
        # Load once and keep
-        self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
+        self._init_states = (
+            get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus)
+            if self.init_states
+            else None
+        )
        self._reset_stride = n_envs  # when performing a reset, append `_reset_stride` to `init_state_id`.

        self.init_state_id = self.episode_index  # tie each sub-env to a fixed init state
@@ -395,6 +394,7 @@ def _make_env_fns(
    gym_kwargs: Mapping[str, Any],
    control_mode: str,
    camera_name_mapping: dict[str, str] | None = None,
+    is_libero_plus: bool = False,
 ) -> list[Callable[[], LiberoEnv]]:
    """Build n_envs factory callables for a single (suite, task_id)."""

@@ -411,6 +411,7 @@ def _make_env_fns(
            n_envs=n_envs,
            control_mode=control_mode,
            camera_name_mapping=camera_name_mapping,
+            is_libero_plus=is_libero_plus,
            **local_kwargs,
        )

@@ -433,6 +434,7 @@ def create_libero_envs(
    control_mode: str = "relative",
    episode_length: int | None = None,
    camera_name_mapping: dict[str, str] | None = None,
+    is_libero_plus: bool = False,
 ) -> dict[str, dict[int, Any]]:
    """
    Create vectorized LIBERO environments with a consistent return shape.
@@ -452,7 +454,7 @@ def create_libero_envs(
    gym_kwargs = dict(gym_kwargs or {})
    task_ids_filter = gym_kwargs.pop("task_ids", None)  # optional: limit to specific tasks

-    camera_names = _parse_camera_names(camera_name)
+    camera_names = parse_camera_names(camera_name)
    suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
    if not suite_names:
        raise ValueError("`task` must contain at least one LIBERO suite name.")
@@ -477,6 +479,7 @@ def create_libero_envs(
        # Probe once and reuse to avoid creating a temp env per task.
        cached_obs_space: spaces.Space | None = None
        cached_act_space: spaces.Space | None = None
+        cached_metadata: dict[str, Any] | None = None

        for tid in selected:
            fns = _make_env_fns(
@@ -490,12 +493,14 @@ def create_libero_envs(
                gym_kwargs=gym_kwargs,
                control_mode=control_mode,
                camera_name_mapping=camera_name_mapping,
+                is_libero_plus=is_libero_plus,
            )
            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[suite_name][tid] = lazy
            else:
                out[suite_name][tid] = env_cls(fns)
@@ -311,6 +311,7 @@ def create_metaworld_envs(
    is_async = env_cls is gym.vector.AsyncVectorEnv
    cached_obs_space = None
    cached_act_space = None
+    cached_metadata = None
    out: dict[str, dict[int, Any]] = defaultdict(dict)

    for group in task_groups:
@@ -324,10 +325,11 @@ def create_metaworld_envs(
            fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]

            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[group][tid] = lazy
            else:
                out[group][tid] = env_cls(fns)
@@ -0,0 +1,425 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
+
+logger = logging.getLogger(__name__)
+
+# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
+# These correspond to the PandaOmron robot in RoboCasa365.
+# OBS_STATE_DIM is the length of the `agent_pos` vector that
+# `RoboCasaEnv._format_raw_obs` builds by concatenating the `state.*` entries
+# in the order listed below; ACTION_DIM is the length `convert_action` splits.
+OBS_STATE_DIM = 16  # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
+ACTION_DIM = 12  # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+# Per-dimension lower/upper bounds of the `spaces.Box` action space declared
+# in `RoboCasaEnv.__init__`.
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+
+# Default PandaOmron cameras. We surface these raw names directly as
+# `observation.images.<name>` so the LeRobot dataset/policy keys match
+# RoboCasa's native convention (no implicit renaming).
+DEFAULT_CAMERAS = [
+    "robot0_agentview_left",
+    "robot0_eye_in_hand",
+    "robot0_agentview_right",
+]
+
+# Object-mesh registries to sample from. RoboCasa's upstream default is
+# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
+# most users — including our CI image — only download the lightwheel pack
+# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
+# category has zero candidates in every registry, robocasa crashes with
+# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
+# normalization). Restricting to registries that are actually on disk
+# avoids the NaN and matches what the asset download provides.
+DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
+
+# Task-group shortcuts accepted as `--env.task`. When the user passes one of
+# these names, we expand it to the upstream RoboCasa task list and auto-set
+# the dataset split. Individual task names (optionally comma-separated) still
+# take precedence; this only triggers on an exact group-name match.
+_TASK_GROUP_SPLITS = {
+    "atomic_seen": "target",
+    "composite_seen": "target",
+    "composite_unseen": "target",
+    "pretrain50": "pretrain",
+    "pretrain100": "pretrain",
+    "pretrain200": "pretrain",
+    "pretrain300": "pretrain",
+}
+
+
+def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
+    """Turn a `--env.task` string into ``(task_names, split_override)``.
+
+    An exact match against a key of `_TASK_GROUP_SPLITS` (after stripping
+    surrounding whitespace) is treated as a benchmark group: the group is
+    looked up in robocasa's `TARGET_TASKS` / `PRETRAINING_TASKS` registries
+    and returned together with the split recorded for it. Anything else is
+    parsed as a comma-separated list of task names with no split override
+    (``None``).
+
+    Raises:
+        ValueError: if the group is missing from the installed robocasa
+            registries, or if the comma-separated list has no non-empty name.
+    """
+    group = task.strip()
+    if group not in _TASK_GROUP_SPLITS:
+        # Plain task name(s): trim each entry and drop the blank ones.
+        names = []
+        for piece in task.split(","):
+            piece = piece.strip()
+            if piece:
+                names.append(piece)
+        if not names:
+            raise ValueError("`task` must contain at least one RoboCasa task name.")
+        return names, None
+
+    # Imported inside the branch, so robocasa is only needed for group names.
+    from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
+
+    # On a key clash the pretraining entry wins, as it is merged last.
+    registry = dict(TARGET_TASKS)
+    registry.update(PRETRAINING_TASKS)
+    if group not in registry:
+        raise ValueError(
+            f"Task group '{group}' is not available in this version of robocasa. "
+            f"Known groups: {sorted(registry.keys())}."
+        )
+    return list(registry[group]), _TASK_GROUP_SPLITS[group]
+
+
+def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
+    """Split a flat (12,) action vector into a RoboCasa action dict.
+
+    Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+
+    Args:
+        flat_action: 1-D vector with exactly 12 entries.
+
+    Returns:
+        Dict mapping each `action.*` key to the matching contiguous slice of
+        `flat_action`, in layout order.
+
+    Raises:
+        ValueError: if `flat_action` is not 1-D with 12 entries. Without this
+            check a short vector would silently produce truncated or empty
+            slices for the trailing keys.
+    """
+    # (key, width) pairs in vector order; the widths add up to 12.
+    layout = (
+        ("action.base_motion", 4),
+        ("action.control_mode", 1),
+        ("action.end_effector_position", 3),
+        ("action.end_effector_rotation", 3),
+        ("action.gripper_close", 1),
+    )
+    expected = sum(width for _, width in layout)
+    shape = np.shape(flat_action)
+    if shape != (expected,):
+        raise ValueError(f"Expected a flat action of shape ({expected},), but got shape {shape}.")
+
+    action_dict: dict[str, Any] = {}
+    start = 0
+    for key, width in layout:
+        action_dict[key] = flat_action[start : start + width]
+        start += width
+    return action_dict
+
+
+class RoboCasaEnv(gym.Env):
+    """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
+
+    Wraps RoboCasaGymEnv from the robocasa package and converts its
+    dict-based observations and actions into the flat arrays LeRobot expects.
+    Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
+
+    The wrapped simulator is created lazily by `_ensure_env()`; constructing
+    this wrapper does not import robocasa.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
+
+    def __init__(
+        self,
+        task: str,
+        camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+        obs_type: str = "pixels_agent_pos",
+        render_mode: str = "rgb_array",
+        observation_width: int = 256,
+        observation_height: int = 256,
+        visualization_width: int = 512,
+        visualization_height: int = 512,
+        split: str | None = None,
+        episode_length: int | None = None,
+        obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+        episode_index: int = 0,
+    ):
+        """Store the configuration and declare the observation/action spaces.
+
+        Args:
+            task: RoboCasa environment name, passed as `env_name` to the
+                wrapped env.
+            camera_name: Camera name(s), normalized by `parse_camera_names`.
+            obs_type: "pixels" or "pixels_agent_pos".
+            render_mode: Stored on the instance; not otherwise read by this class.
+            observation_width: Camera frame width passed to the wrapped env.
+            observation_height: Camera frame height passed to the wrapped env.
+            visualization_width: Stored on the instance; not otherwise read by this class.
+            visualization_height: Stored on the instance; not otherwise read by this class.
+            split: Dataset split forwarded to the wrapped env ("all" when None).
+            episode_length: Stored as `_max_episode_steps` (1000 when None).
+                `step()` in this class does not itself enforce the limit.
+            obj_registries: Object-mesh registries forwarded to the wrapped env.
+            episode_index: Offset added to the seed in `reset()`.
+
+        Raises:
+            ValueError: if `obs_type` is not one of the two supported values.
+        """
+        super().__init__()
+        self.task = task
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.observation_width = observation_width
+        self.observation_height = observation_height
+        self.visualization_width = visualization_width
+        self.visualization_height = visualization_height
+        self.split = split
+        self.obj_registries = tuple(obj_registries)
+        # Per-worker index (0..n_envs-1) used to spread the user-provided
+        # seed across factories so each sub-env explores a distinct layout
+        # even when the same seed is passed to `reset()`.
+        self.episode_index = int(episode_index)
+
+        self.camera_name = parse_camera_names(camera_name)
+
+        self._max_episode_steps = episode_length if episode_length is not None else 1000
+
+        # Deferred — created on first reset() inside the worker subprocess
+        # to avoid inheriting stale GPU/EGL contexts across fork().
+        self._env: Any = None
+        self.task_description = ""
+
+        # One uint8 HxWx3 image space per requested camera.
+        images = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_name
+        }
+
+        if self.obs_type == "pixels":
+            self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-np.inf,
+                        high=np.inf,
+                        shape=(OBS_STATE_DIM,),
+                        dtype=np.float32,
+                    ),
+                }
+            )
+        else:
+            raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
+
+        self.action_space = spaces.Box(
+            low=ACTION_LOW,
+            high=ACTION_HIGH,
+            shape=(ACTION_DIM,),
+            dtype=np.float32,
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the underlying RoboCasaGymEnv on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+        """
+        if self._env is not None:
+            return
+        from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
+
+        # RoboCasaGymEnv defaults split="test", which create_env rejects
+        # (only None/"all"/"pretrain"/"target" are valid). Always pass a
+        # valid value so we don't hit that default. Extra kwargs are
+        # forwarded to the underlying kitchen env via create_env/robosuite.make.
+        self._env = RoboCasaGymEnv(
+            env_name=self.task,
+            camera_widths=self.observation_width,
+            camera_heights=self.observation_height,
+            split=self.split if self.split is not None else "all",
+            obj_registries=self.obj_registries,
+        )
+
+        # Use the episode's language instruction when present, else the task name.
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+    def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
+        """Convert RoboCasaGymEnv observation dict to LeRobot format.
+
+        Returns `{"pixels": {...}}` for obs_type "pixels", otherwise also an
+        `agent_pos` float32 vector built from the `state.*` entries.
+        """
+        # RoboCasaGymEnv emits camera frames under "video.<cam>".
+        # Cameras missing from `raw_obs` are left out of the result.
+        images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+
+        # `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
+        # A missing key is replaced by zeros of the expected width.
+        agent_pos = np.concatenate(
+            [
+                raw_obs.get("state.base_position", np.zeros(3)),
+                raw_obs.get("state.base_rotation", np.zeros(4)),
+                raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
+                raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
+                raw_obs.get("state.gripper_qpos", np.zeros(2)),
+            ],
+            axis=-1,
+        ).astype(np.float32)
+
+        return {"pixels": images, "agent_pos": agent_pos}
+
+    def render(self) -> np.ndarray:
+        """Return the wrapped env's rendered frame, creating the env if needed."""
+        self._ensure_env()
+        assert self._env is not None
+        return self._env.render()
+
+    def reset(self, seed=None, **kwargs):
+        """Reset the wrapped env and return `(observation, info)`.
+
+        Args:
+            seed: Optional base seed; `episode_index` is added to it.
+            **kwargs: Accepted for API compatibility and ignored.
+
+        Returns:
+            The formatted observation and `{"is_success": False}`. The info
+            dict returned by the wrapped env is discarded.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        super().reset(seed=seed)
+        # Spread the seed across workers so n_envs factories don't all
+        # roll the same scene. With an explicit user seed we shift it by
+        # episode_index; with no seed we fall back to episode_index so
+        # each worker is still distinct rather than inheriting the same
+        # global RNG state.
+        worker_seed = seed + self.episode_index if seed is not None else self.episode_index
+        raw_obs, _ = self._env.reset(seed=worker_seed)
+
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+        observation = self._format_raw_obs(raw_obs)
+        info = {"is_success": False}
+        return observation, info
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one flat action and return the gymnasium 5-tuple.
+
+        `terminated` is True when the wrapped env reports done or its info
+        has a truthy "success". On termination `info["final_info"]` is filled
+        and the env is reset, but the returned observation is the one from
+        the terminal step (it is formatted before the reset).
+
+        Raises:
+            ValueError: if `action` is not 1-D.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        action_dict = convert_action(action)
+        raw_obs, reward, done, truncated, info = self._env.step(action_dict)
+
+        is_success = bool(info.get("success", False))
+        # Coerce so `terminated` is a plain bool whatever type `done` has.
+        terminated = bool(done) or is_success
+        info.update({"task": self.task, "done": done, "is_success": is_success})
+
+        observation = self._format_raw_obs(raw_obs)
+        if terminated:
+            info["final_info"] = {
+                "task": self.task,
+                "done": bool(done),
+                "is_success": bool(is_success),
+            }
+            # NOTE(review): this unseeded reset re-seeds the wrapped env with
+            # `episode_index` every time (see `reset`); confirm callers reseed
+            # explicitly when they need distinct episodes.
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+    def close(self):
+        """Close the wrapped env if it was created; safe to call repeatedly."""
+        # Clear the handle first so a second close() is a no-op and a later
+        # reset()/step()/render() rebuilds the simulator via _ensure_env().
+        env, self._env = self._env, None
+        if env is not None:
+            env.close()
+
+
+def _make_env_fns(
+    *,
+    task: str,
+    n_envs: int,
+    camera_names: list[str],
+    obs_type: str,
+    render_mode: str,
+    observation_width: int,
+    observation_height: int,
+    visualization_width: int,
+    visualization_height: int,
+    split: str | None,
+    episode_length: int | None,
+    obj_registries: Sequence[str],
+) -> list[Callable[[], RoboCasaEnv]]:
+    """Return ``n_envs`` zero-argument factories for one RoboCasa task.
+
+    All factories share the same constructor arguments except
+    ``episode_index``, which runs over ``0..n_envs-1`` so that
+    ``RoboCasaEnv.reset()`` offsets the seed differently in each sub-env.
+    """
+    shared_kwargs = {
+        "task": task,
+        "camera_name": camera_names,
+        "obs_type": obs_type,
+        "render_mode": render_mode,
+        "observation_width": observation_width,
+        "observation_height": observation_height,
+        "visualization_width": visualization_width,
+        "visualization_height": visualization_height,
+        "split": split,
+        "episode_length": episode_length,
+        "obj_registries": obj_registries,
+    }
+
+    def _build(worker_index: int) -> RoboCasaEnv:
+        return RoboCasaEnv(episode_index=worker_index, **shared_kwargs)
+
+    factories: list[Callable[[], RoboCasaEnv]] = []
+    for worker_index in range(n_envs):
+        # Bind the index now; nothing is constructed until the factory is called.
+        factories.append(partial(_build, worker_index))
+    return factories
+
+
+def create_robocasa_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] | None = None,
+    camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    episode_length: int | None = None,
+    obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+) -> dict[str, dict[int, Any]]:
+    """Build one vectorized RoboCasa365 env per requested task.
+
+    Returns:
+        ``{task_name: {0: vec_env}}`` where each ``vec_env`` wraps exactly
+        ``n_envs`` factories. When ``env_cls`` is ``gym.vector.AsyncVectorEnv``
+        the value is a ``_LazyAsyncVectorEnv``; otherwise it is ``env_cls(fns)``.
+
+    `task` can be:
+      - a single task name (e.g. `CloseFridge`)
+      - a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`)
+      - a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
+        `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
+        `pretrain300`), which expands to the upstream task list and supplies
+        the dataset `split` ("target" or "pretrain") unless `gym_kwargs`
+        already provides a non-None `split`.
+
+    Only `obs_type`, `render_mode`, `observation_width`, `observation_height`,
+    `visualization_width`, `visualization_height` and `split` are read from
+    `gym_kwargs`; any other key is ignored.
+
+    Raises:
+        ValueError: if `env_cls` is not callable or `n_envs` is not a
+            positive int.
+    """
+    if not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not (isinstance(n_envs, int) and n_envs > 0):
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    # Recognised options and the value used when the caller omits them.
+    option_defaults: dict[str, Any] = {
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array",
+        "observation_width": 256,
+        "observation_height": 256,
+        "visualization_width": 512,
+        "visualization_height": 512,
+        "split": None,
+    }
+    supplied = dict(gym_kwargs or {})
+    env_options = {key: supplied.get(key, default) for key, default in option_defaults.items()}
+
+    camera_names = parse_camera_names(camera_name)
+    task_names, group_split = _resolve_tasks(str(task))
+    # An explicit split wins; otherwise fall back to the group's split, if any.
+    if env_options["split"] is None and group_split is not None:
+        env_options["split"] = group_split
+
+    logger.info(
+        "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
+        task_names,
+        env_options["split"],
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+
+    # Spaces/metadata read from the first lazy env and handed to later ones.
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
+    result: dict[str, dict[int, Any]] = {}
+
+    for name in task_names:
+        factories = _make_env_fns(
+            task=name,
+            n_envs=n_envs,
+            camera_names=camera_names,
+            episode_length=episode_length,
+            obj_registries=obj_registries,
+            **env_options,
+        )
+
+        if not is_async:
+            vec_env = env_cls(factories)
+        else:
+            vec_env = _LazyAsyncVectorEnv(factories, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = vec_env.observation_space
+                cached_act_space = vec_env.action_space
+                cached_metadata = vec_env.metadata
+        result.setdefault(name, {})[0] = vec_env
+        logger.info("Built vec env | task=%s | n_envs=%d", name, n_envs)
+
+    return result
@@ -1,20 +1,18 @@
-#!/usr/bin/env python
+"""RoboMME environment wrapper for LeRobot evaluation.

-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
+``VectorEnv`` suitable for ``lerobot_eval``.

-"""RoboMME environment wrapper for LeRobot evaluation."""
+RoboMME tasks:
+  Counting:    BinFill, PickXtimes, SwingXtimes, StopCube
+  Permanence:  VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
+  Reference:   PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
+  Imitation:   MoveCube, InsertPeg, PatternLock, RouteStick
+
+Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes)
+Install: see docker/Dockerfile.benchmark.robomme  (Linux only — mani-skill vs numpy pin conflict)
+Benchmark: https://github.com/RoboMME/robomme_benchmark
+"""

 from __future__ import annotations

@@ -26,6 +24,8 @@ import gymnasium as gym
 import numpy as np
 from gymnasium import spaces

+from .utils import _LazyAsyncVectorEnv
+
 ROBOMME_TASKS = [
    "BinFill",
    "PickXtimes",
@@ -62,6 +62,13 @@ class RoboMMEGymEnv(gym.Env):
        super().__init__()
        from robomme.env_record_wrapper import BenchmarkEnvBuilder

+        self._task = task
+        self._action_space_type = action_space_type
+        self._dataset = dataset
+        self._episode_idx = episode_idx
+        self._max_steps = max_steps
+        self._max_episode_steps = max_steps
+
        self._builder = BenchmarkEnvBuilder(
            env_id=task,
            dataset=dataset,
@@ -69,19 +76,24 @@ class RoboMMEGymEnv(gym.Env):
            gui_render=False,
            max_steps=max_steps,
        )
-        self._max_episode_steps = max_steps
-        self._episode_idx = episode_idx
-        self._max_steps = max_steps
        self._env = None
        self._last_raw_obs: dict | None = None

        action_dim = 8 if action_space_type == "joint_angle" else 7
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
+        # `pixels` must be a nested Dict so `preprocess_observation()` in
+        # envs/utils.py picks it up and maps each camera to
+        # `observation.images.<cam>`. A flat layout (`pixels/image`,
+        # `pixels/wrist_image`) silently drops every image from the batch.
        self.observation_space = spaces.Dict(
            {
-                "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
-                "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
-                "state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
+                "pixels": spaces.Dict(
+                    {
+                        "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                        "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                    }
+                ),
+                "agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
            }
        )

@@ -103,12 +115,14 @@ class RoboMMEGymEnv(gym.Env):
        truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)

        status = info.get("status", "ongoing")
+        is_success = status == "success"
        conv_info = self._convert_info(info)
-        conv_info["is_success"] = status == "success"
+        conv_info["is_success"] = is_success

        return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info

    def render(self) -> np.ndarray | None:
+        """Return the front camera image from the last observation for video recording."""
        if self._last_raw_obs is None:
            return np.zeros((256, 256, 3), dtype=np.uint8)
        front = self._last_raw_obs.get("front_rgb_list")
@@ -135,14 +149,15 @@ class RoboMMEGymEnv(gym.Env):
            else obs["gripper_state_list"]
        )

+        front_rgb = np.asarray(front_rgb, dtype=np.uint8)
+        wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8)
        joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
        gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
        state = np.concatenate([joint, gripper])

        return {
-            "image": np.asarray(front_rgb, dtype=np.uint8),
-            "wrist_image": np.asarray(wrist_rgb, dtype=np.uint8),
-            "state": state,
+            "pixels": {"image": front_rgb, "wrist_image": wrist_rgb},
+            "agent_pos": state,
        }

    def _convert_info(self, info: dict) -> dict:
@@ -161,6 +176,8 @@ def _make_env_fns(
    episode_length: int,
    task_id: int,
 ) -> list[Callable[[], RoboMMEGymEnv]]:
+    """Build n_envs factory callables for one RoboMME task id."""
+
    def _make_one(episode_index: int) -> RoboMMEGymEnv:
        return RoboMMEGymEnv(
            task=task,
@@ -182,7 +199,14 @@ def create_robomme_envs(
    task_ids: list[int] | None = None,
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
 ) -> dict[str, dict[int, gym.vector.VectorEnv]]:
-    """Create vectorized RoboMME environments for evaluation."""
+    """Create vectorized RoboMME environments for evaluation.
+
+    `task` may be a single RoboMME task name (e.g. "PickXtimes") or a
+    comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task
+    becomes its own suite in the returned mapping.
+
+    Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
+    """
    if env_cls is None or not callable(env_cls):
        raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
    if not isinstance(n_envs, int) or n_envs <= 0:
@@ -192,6 +216,10 @@ def create_robomme_envs(
        task_ids = [0]

    task_names = [t.strip() for t in task.split(",") if t.strip()]
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
    out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
    for task_name in task_names:
        envs_by_task: dict[int, gym.vector.VectorEnv] = {}
@@ -204,6 +232,14 @@ def create_robomme_envs(
                episode_length=episode_length,
                task_id=task_id,
            )
-            envs_by_task[task_id] = env_cls(fns)
+            if is_async:
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+                if cached_obs_space is None:
+                    cached_obs_space = lazy.observation_space
+                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
+                envs_by_task[task_id] = lazy
+            else:
+                envs_by_task[task_id] = env_cls(fns)
        out[task_name] = envs_by_task
    return out
@@ -0,0 +1,488 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import importlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+# Camera names as used by RoboTwin 2.0. The wrapper looks each name up in the
+# "observation" dict returned by get_obs() and reads that entry's "rgb" field.
+ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
+    "head_camera",
+    "left_camera",
+    "right_camera",
+)
+
+ACTION_DIM = 14  # 7 DOF × 2 arms
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+DEFAULT_EPISODE_LENGTH = 300
+# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
+DEFAULT_CAMERA_H = 240
+DEFAULT_CAMERA_W = 320
+
+# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
+# (50 tasks as of main; earlier revisions had 60 with a different split).
+# Keep this in sync with:
+#   gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
+#     | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
+ROBOTWIN_TASKS: tuple[str, ...] = (
+    "adjust_bottle",
+    "beat_block_hammer",
+    "blocks_ranking_rgb",
+    "blocks_ranking_size",
+    "click_alarmclock",
+    "click_bell",
+    "dump_bin_bigbin",
+    "grab_roller",
+    "handover_block",
+    "handover_mic",
+    "hanging_mug",
+    "lift_pot",
+    "move_can_pot",
+    "move_pillbottle_pad",
+    "move_playingcard_away",
+    "move_stapler_pad",
+    "open_laptop",
+    "open_microwave",
+    "pick_diverse_bottles",
+    "pick_dual_bottles",
+    "place_a2b_left",
+    "place_a2b_right",
+    "place_bread_basket",
+    "place_bread_skillet",
+    "place_burger_fries",
+    "place_can_basket",
+    "place_cans_plasticbox",
+    "place_container_plate",
+    "place_dual_shoes",
+    "place_empty_cup",
+    "place_fan",
+    "place_mouse_pad",
+    "place_object_basket",
+    "place_object_scale",
+    "place_object_stand",
+    "place_phone_stand",
+    "place_shoe",
+    "press_stapler",
+    "put_bottles_dustbin",
+    "put_object_cabinet",
+    "rotate_qrcode",
+    "scan_object",
+    "shake_bottle",
+    "shake_bottle_horizontally",
+    "stack_blocks_three",
+    "stack_blocks_two",
+    "stack_bowls_three",
+    "stack_bowls_two",
+    "stamp_seal",
+    "turn_switch",
+)
+
+
+# Per-task memo of the fully resolved setup_demo kwargs. Entries are never
+# handed out directly; `_load_robotwin_setup_kwargs` returns deep copies.
+_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
+
+
+def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
+    """Build the kwargs dict RoboTwin's setup_demo expects.
+
+    Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
+    reads ``task_config/demo_clean.yml``, resolves the embodiment file from
+    ``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
+    reads camera dimensions from ``_camera_config.yml``.
+
+    Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
+    used by beat_block_hammer and most smoke-test tasks).
+
+    The resolved dict is memoised per task name in ``_ROBOTWIN_SETUP_CACHE``.
+    Every call returns an independent deep copy, so a caller (or
+    ``setup_demo`` itself) may mutate nested entries such as ``camera`` or the
+    embodiment configs without leaking those changes into later resets.
+
+    Args:
+        task_name: RoboTwin task name; stored under ``"task_name"`` in the result.
+
+    Returns:
+        A fresh kwargs dict for ``setup_demo``.
+
+    Raises:
+        ValueError: If the ``embodiment`` entry has neither 1 nor 3 items.
+    """
+    import copy
+
+    if task_name in _ROBOTWIN_SETUP_CACHE:
+        return copy.deepcopy(_ROBOTWIN_SETUP_CACHE[task_name])
+
+    import os
+
+    import yaml  # type: ignore[import-untyped]
+    from envs import CONFIGS_PATH  # type: ignore[import-not-found]
+
+    task_config = "demo_clean"
+    with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
+        args = yaml.safe_load(f)
+
+    # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
+    with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f:
+        embodiment_types = yaml.safe_load(f)
+    embodiment = args.get("embodiment", ["aloha-agilex"])
+    if len(embodiment) == 1:
+        # One entry: the same robot file drives both arms.
+        robot_file = embodiment_types[embodiment[0]]["file_path"]
+        args["left_robot_file"] = robot_file
+        args["right_robot_file"] = robot_file
+        args["dual_arm_embodied"] = True
+    elif len(embodiment) == 3:
+        # Three entries: left robot, right robot, and the value stored as "embodiment_dis".
+        args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
+        args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
+        args["embodiment_dis"] = embodiment[2]
+        args["dual_arm_embodied"] = False
+    else:
+        raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
+
+    with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f:
+        args["left_embodiment_config"] = yaml.safe_load(f)
+    with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f:
+        args["right_embodiment_config"] = yaml.safe_load(f)
+
+    # Camera dimensions
+    with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f:
+        camera_config = yaml.safe_load(f)
+    head_cam = args["camera"]["head_camera_type"]
+    args["head_camera_h"] = camera_config[head_cam]["h"]
+    args["head_camera_w"] = camera_config[head_cam]["w"]
+
+    # Headless overrides
+    args["render_freq"] = 0
+    args["task_name"] = task_name
+    args["task_config"] = task_config
+
+    _ROBOTWIN_SETUP_CACHE[task_name] = args
+    # Deep copy so the caller cannot reach the cached nested dicts.
+    return copy.deepcopy(args)
+
+
+def _load_robotwin_task(task_name: str) -> type:
+    """Import ``envs.<task_name>`` and return the attribute named ``task_name``.
+
+    RoboTwin 2.0 keeps each task in ``envs/<task_name>.py``; that package is
+    expected to be importable (on ``sys.path``) once RoboTwin is installed.
+
+    Raises:
+        ModuleNotFoundError: If the import fails; the original error is chained.
+        AttributeError: If the module has no (non-``None``) attribute with the
+            task's name.
+    """
+    module_path = f"envs.{task_name}"
+    try:
+        task_module = importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        message = (
+            f"Could not import RoboTwin task '{task_name}'. "
+            "Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
+            "See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
+        )
+        raise ModuleNotFoundError(message) from e
+
+    found = getattr(task_module, task_name, None)
+    if found is not None:
+        return found
+    raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
+
+
+class RoboTwinEnv(gym.Env):
+    """Gymnasium wrapper around a single RoboTwin 2.0 task.
+
+    RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
+    ``take_action`` / ``check_success``) rather than the standard gym interface.
+    This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
+    RoboTwin exactly like LIBERO or Meta-World.
+
+    The underlying SAPIEN environment is created lazily on the first ``reset()``
+    call *inside the worker process*.  This is required for
+    ``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
+    contexts that must not be forked from the parent process.
+
+    Observations
+    ------------
+    The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
+    ``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
+    ``envs/utils.py`` then converts these to ``observation.images.<cam>``.
+    ``agent_pos`` is a 14-dim float32 vector taken from the task's
+    ``joint_action["vector"]`` (zeros when that entry is missing or too short).
+
+    Actions
+    -------
+    14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
+
+    Autograd
+    --------
+    ``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
+    optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
+    the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
+
+    def __init__(
+        self,
+        task_name: str,
+        episode_index: int = 0,
+        n_envs: int = 1,
+        camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+        observation_height: int | None = None,
+        observation_width: int | None = None,
+        episode_length: int = DEFAULT_EPISODE_LENGTH,
+        render_mode: str = "rgb_array",
+    ):
+        """Store the configuration and declare the spaces; no simulator is built here.
+
+        Args:
+            task_name: RoboTwin task to load; also exposed as ``self.task``.
+            episode_index: Seed used by the next ``reset()`` that is called
+                without an explicit seed.
+            n_envs: Amount added to ``episode_index`` after every reset.
+            camera_names: Cameras to expose under ``pixels``.
+            observation_height: Image height; a falsy value selects
+                ``DEFAULT_CAMERA_H``.
+            observation_width: Image width; a falsy value selects
+                ``DEFAULT_CAMERA_W``.
+            episode_length: Step count at which ``step()`` reports truncation.
+            render_mode: Stored on the instance as ``render_mode``.
+        """
+        super().__init__()
+        self.task_name = task_name
+        self.task = task_name  # used by add_envs_task() in utils.py
+        self.task_description = task_name.replace("_", " ")
+        self.episode_index = episode_index
+        self._reset_stride = n_envs
+        self.camera_names = list(camera_names)
+        # Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
+        # The YAML-driven lookup is deferred to reset() so construction doesn't
+        # import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
+        self.observation_height = observation_height or DEFAULT_CAMERA_H
+        self.observation_width = observation_width or DEFAULT_CAMERA_W
+        self.episode_length = episode_length
+        self._max_episode_steps = episode_length  # lerobot_eval.rollout reads this
+        self.render_mode = render_mode
+
+        self._env: Any | None = None  # deferred — created on first reset() inside worker
+        self._step_count: int = 0
+        # Placeholder image returned for any camera missing from get_obs().
+        # The same array object is reused for every missing camera.
+        self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
+
+        image_spaces = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_names
+        }
+        self.observation_space = spaces.Dict(
+            {
+                "pixels": spaces.Dict(image_spaces),
+                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
+            }
+        )
+        self.action_space = spaces.Box(
+            low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the SAPIEN environment on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own EGL/GPU context rather than inheriting a stale one from the
+        parent process (which causes crashes with AsyncVectorEnv).
+        """
+        if self._env is not None:
+            return
+        task_cls = _load_robotwin_task(self.task_name)
+        self._env = task_cls()
+
+    def _get_obs(self) -> RobotObservation:
+        """Convert the task's ``get_obs()`` output into the declared observation layout.
+
+        Each configured camera is read from ``raw["observation"][cam]["rgb"]``.
+        A missing camera or missing ``rgb`` entry yields the shared black frame.
+        2-D images are stacked into three identical channels, and images whose
+        last axis is not 3 are cut to their first three channels. Images are
+        not resized here.
+
+        Returns:
+            ``{"pixels": {cam: uint8 image}, "agent_pos": float32 vector of length ACTION_DIM}``.
+        """
+        assert self._env is not None, "_get_obs called before _ensure_env()"
+        raw = self._env.get_obs()
+        cameras_raw = raw.get("observation", {})
+
+        images: dict[str, np.ndarray] = {}
+        for cam in self.camera_names:
+            cam_data = cameras_raw.get(cam)
+            img = cam_data.get("rgb") if cam_data else None
+            if img is None:
+                images[cam] = self._black_frame
+                continue
+            img = np.asarray(img, dtype=np.uint8)
+            if img.ndim == 2:
+                img = np.stack([img, img, img], axis=-1)
+            elif img.shape[-1] != 3:
+                img = img[..., :3]
+            images[cam] = img
+
+        # Joint state: first ACTION_DIM entries of joint_action["vector"];
+        # zeros when the vector is absent or shorter than ACTION_DIM.
+        ja = raw.get("joint_action") or {}
+        vec = ja.get("vector")
+        if vec is not None:
+            arr = np.asarray(vec, dtype=np.float32).ravel()
+            joint_state = (
+                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
+            )
+        else:
+            joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
+
+        return {"pixels": images, "agent_pos": joint_state}
+
+    def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
+        """Start a new episode by calling the task's ``setup_demo``.
+
+        Args:
+            seed: Seed passed to ``setup_demo``. When ``None``, the current
+                ``episode_index`` is used instead.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            The first observation and an info dict with ``is_success=False``
+            and the task name.
+
+        ``episode_index`` is advanced by ``n_envs`` after every call, whether
+        or not an explicit seed was given, and the step counter is cleared.
+        """
+        self._ensure_env()
+        super().reset(seed=seed)
+        assert self._env is not None  # set by _ensure_env() above
+
+        actual_seed = self.episode_index if seed is None else seed
+        setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
+        setup_kwargs.update(seed=actual_seed, is_test=True)
+        # Re-enable autograd for setup_demo (see the class docstring).
+        with torch.enable_grad():
+            self._env.setup_demo(**setup_kwargs)
+        self.episode_index += self._reset_stride
+        self._step_count = 0
+
+        obs = self._get_obs()
+        return obs, {"is_success": False, "task": self.task_name}
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one action and report the outcome.
+
+        Args:
+            action: 1-D array of length ``ACTION_DIM``.
+
+        Returns:
+            ``(obs, reward, terminated, truncated, info)``. ``reward`` is 1.0
+            on success and 0.0 otherwise; ``terminated`` equals the success
+            flag; ``truncated`` is set once ``episode_length`` steps have run.
+
+        Raises:
+            ValueError: If ``action`` is not 1-D with ``ACTION_DIM`` entries.
+
+        When the episode ends, ``info["final_info"]`` is filled in and the
+        environment resets itself. The observation returned is the one
+        captured before that reset.
+        """
+        assert self._env is not None, "step() called before reset()"
+        if action.ndim != 1 or action.shape[0] != ACTION_DIM:
+            raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
+
+        # Prefer the task's take_action(); fall back to step() if it is absent.
+        with torch.enable_grad():
+            if hasattr(self._env, "take_action"):
+                self._env.take_action(action)
+            else:
+                self._env.step(action)
+
+        self._step_count += 1
+
+        # Success comes from the eval_success attribute when it is truthy,
+        # otherwise from check_success() if the task provides it.
+        is_success = bool(getattr(self._env, "eval_success", False))
+        if not is_success and hasattr(self._env, "check_success"):
+            is_success = bool(self._env.check_success())
+
+        obs = self._get_obs()
+        reward = float(is_success)
+        terminated = is_success
+        truncated = self._step_count >= self.episode_length
+
+        info: dict[str, Any] = {
+            "task": self.task_name,
+            "is_success": is_success,
+            "step": self._step_count,
+        }
+        if terminated or truncated:
+            info["final_info"] = {
+                "task": self.task_name,
+                "is_success": is_success,
+            }
+            # Self-reset; `obs` above was captured before this call.
+            self.reset()
+
+        return obs, reward, terminated, truncated, info
+
+    def render(self) -> np.ndarray:
+        """Return the current ``head_camera`` image, or the first available camera image."""
+        self._ensure_env()
+        obs = self._get_obs()
+        # Prefer head camera for rendering; fall back to first available.
+        if "head_camera" in obs["pixels"]:
+            return obs["pixels"]["head_camera"]
+        return next(iter(obs["pixels"].values()))
+
+    def close(self) -> None:
+        """Release the task object, calling its ``close_env()`` when it has one.
+
+        A ``TypeError`` raised by ``close_env()`` is suppressed; other
+        exceptions propagate. Does nothing if no task object was created.
+        """
+        if self._env is not None:
+            if hasattr(self._env, "close_env"):
+                import contextlib
+
+                with contextlib.suppress(TypeError):
+                    self._env.close_env()
+            self._env = None
+
+
+# ---- Multi-task factory --------------------------------------------------------
+
+
+def _make_env_fns(
+    *,
+    task_name: str,
+    n_envs: int,
+    camera_names: list[str],
+    observation_height: int,
+    observation_width: int,
+    episode_length: int,
+) -> list[Callable[[], RoboTwinEnv]]:
+    """Build ``n_envs`` zero-argument ``RoboTwinEnv`` constructors for one task.
+
+    The i-th callable creates an env whose ``episode_index`` is ``i``; every
+    other setting is shared across the callables.
+    """
+    shared_settings = {
+        "task_name": task_name,
+        "n_envs": n_envs,
+        "camera_names": camera_names,
+        "observation_height": observation_height,
+        "observation_width": observation_width,
+        "episode_length": episode_length,
+    }
+
+    def _build(episode_index: int) -> RoboTwinEnv:
+        return RoboTwinEnv(episode_index=episode_index, **shared_settings)
+
+    factories = []
+    for slot in range(n_envs):
+        factories.append(partial(_build, slot))
+    return factories
+
+
+def create_robotwin_envs(
+    task: str,
+    n_envs: int,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+    observation_height: int = DEFAULT_CAMERA_H,
+    observation_width: int = DEFAULT_CAMERA_W,
+    episode_length: int = DEFAULT_EPISODE_LENGTH,
+) -> dict[str, dict[int, Any]]:
+    """Build one vectorized RoboTwin 2.0 environment per requested task.
+
+    Args:
+        task: One task name or several separated by commas, e.g.
+            ``"beat_block_hammer,click_bell"``. Blank entries are ignored.
+        n_envs: Parallel rollouts inside each vector env.
+        env_cls: Vector env constructor. When it is exactly
+            ``gym.vector.AsyncVectorEnv`` a ``_LazyAsyncVectorEnv`` is built
+            instead, and the spaces/metadata probed for the first task are
+            passed on to the later ones.
+        camera_names: Cameras to include in observations.
+        observation_height: Pixel height for all cameras.
+        observation_width: Pixel width for all cameras.
+        episode_length: Max steps before truncation.
+
+    Returns:
+        ``{task_name: {0: vector_env}}`` with one entry per task.
+
+    Raises:
+        ValueError: If ``env_cls`` is not callable, ``n_envs`` is not a
+            positive int, no task name is given, or a name is not listed in
+            ``ROBOTWIN_TASKS``.
+    """
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    requested = [name for name in (part.strip() for part in str(task).split(",")) if name]
+    if not requested:
+        raise ValueError("`task` must contain at least one RoboTwin task name.")
+
+    unrecognized = [name for name in requested if name not in ROBOTWIN_TASKS]
+    if unrecognized:
+        raise ValueError(
+            f"Unknown RoboTwin tasks: {unrecognized}. Available tasks: {sorted(ROBOTWIN_TASKS)}"
+        )
+
+    logger.info("Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d", requested, n_envs)
+
+    use_lazy_async = env_cls is gym.vector.AsyncVectorEnv
+    # Filled from the first lazy env so later ones skip building a probe env.
+    obs_space: spaces.Space | None = None
+    act_space: spaces.Space | None = None
+    env_metadata: dict[str, Any] | None = None
+
+    result: dict[str, dict[int, Any]] = {}
+    for name in requested:
+        factories = _make_env_fns(
+            task_name=name,
+            n_envs=n_envs,
+            camera_names=list(camera_names),
+            observation_height=observation_height,
+            observation_width=observation_width,
+            episode_length=episode_length,
+        )
+        if not use_lazy_async:
+            vec_env = env_cls(factories)
+        else:
+            vec_env = _LazyAsyncVectorEnv(factories, obs_space, act_space, env_metadata)
+            if obs_space is None:
+                obs_space = vec_env.observation_space
+                act_space = vec_env.action_space
+                env_metadata = vec_env.metadata
+        result[name] = {0: vec_env}
+        logger.info("Built vec env | task=%s | n_envs=%d", name, n_envs)
+
+    return result
@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
 from .configs import EnvConfig


+def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
+    """Turn ``camera_name`` into a non-empty list of camera name strings.
+
+    A string is split on commas (``"cam_a,cam_b"``); a list or tuple is read
+    item by item, each converted with ``str``. Entries are stripped of
+    surrounding whitespace and blank ones are discarded.
+
+    Raises:
+        TypeError: If ``camera_name`` is not a ``str``, ``list`` or ``tuple``.
+        ValueError: If no name is left after stripping.
+    """
+    if isinstance(camera_name, str):
+        candidates = camera_name.split(",")
+    elif isinstance(camera_name, (list, tuple)):
+        candidates = [str(item) for item in camera_name]
+    else:
+        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
+
+    names = [cleaned for cleaned in (entry.strip() for entry in candidates) if cleaned]
+    if not names:
+        raise ValueError("camera_name resolved to an empty list.")
+    return names
+
+
 def _convert_nested_dict(d):
    result = {}
    for k, v in d.items():
@@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv:
        env_fns: list[Callable],
        observation_space=None,
        action_space=None,
+        metadata=None,
    ):
        self._env_fns = env_fns
        self._env: gym.vector.AsyncVectorEnv | None = None
        self.num_envs = len(env_fns)
-        if observation_space is not None and action_space is not None:
+        if observation_space is not None and action_space is not None and metadata is not None:
            self.observation_space = observation_space
            self.action_space = action_space
+            self.metadata = metadata
        else:
            tmp = env_fns[0]()
            self.observation_space = tmp.observation_space
            self.action_space = tmp.action_space
+            self.metadata = tmp.metadata
            tmp.close()
        self.single_observation_space = self.observation_space
        self.single_action_space = self.action_space
@@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv:
        if self._env is None:
            self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)

+    @property
+    def unwrapped(self):
+        return self
+
    def reset(self, **kwargs):
        self._ensure()
        return self._env.reset(**kwargs)
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VLABench environment wrapper for LeRobot.
+
+VLABench is a large-scale benchmark for language-conditioned robotic manipulation
+with long-horizon reasoning, built on MuJoCo/dm_control.
+
+- Paper: https://arxiv.org/abs/2412.18194
+- GitHub: https://github.com/OpenMOSS/VLABench
+- Website: https://vlabench.github.io
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from typing import Any
+
+import cv2
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+from scipy.spatial.transform import Rotation
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+ACTION_DIM = 7  # pos(3) + euler(3) + gripper(1)
+ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32)
+ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+
+# Default max episode steps per task type
+DEFAULT_MAX_EPISODE_STEPS = 500
+
+# VLABench task suites
+PRIMITIVE_TASKS = [
+    "select_fruit",
+    "select_toy",
+    "select_chemistry_tube",
+    "add_condiment",
+    "select_book",
+    "select_painting",
+    "select_drink",
+    "insert_flower",
+    "select_billiards",
+    "select_ingredient",
+    "select_mahjong",
+    "select_poker",
+    # Physical series
+    "density_qa",
+    "friction_qa",
+    "magnetism_qa",
+    "reflection_qa",
+    "simple_cuestick_usage",
+    "simple_seesaw_usage",
+    "sound_speed_qa",
+    "thermal_expansion_qa",
+    "weight_qa",
+]
+
+COMPOSITE_TASKS = [
+    "cluster_billiards",
+    "cluster_book",
+    "cluster_drink",
+    "cluster_toy",
+    "cook_dishes",
+    "cool_drink",
+    "find_unseen_object",
+    "get_coffee",
+    "hammer_nail",
+    "heat_food",
+    "make_juice",
+    "play_mahjong",
+    "play_math_game",
+    "play_poker",
+    "play_snooker",
+    "rearrange_book",
+    "rearrange_chemistry_tube",
+    "set_dining_table",
+    "set_study_table",
+    "store_food",
+    "take_chemistry_experiment",
+    "use_seesaw_complex",
+]
+
+SUITE_TASKS: dict[str, list[str]] = {
+    "primitive": PRIMITIVE_TASKS,
+    "composite": COMPOSITE_TASKS,
+}
+
+
+class VLABenchEnv(gym.Env):
+    """Gymnasium wrapper for VLABench environments.
+
+    Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface.
+    Supports multiple cameras (front, second, wrist) and end-effector control.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
+
+    def __init__(
+        self,
+        task: str = "select_fruit",
+        obs_type: str = "pixels_agent_pos",
+        render_mode: str = "rgb_array",
+        render_resolution: tuple[int, int] = (480, 480),
+        robot: str = "franka",
+        max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS,
+        action_mode: str = "eef",
+    ):
+        """Record the configuration and declare the spaces; the simulator is built later.
+
+        Args:
+            task: VLABench task name handed to ``load_env`` when the env is built.
+            obs_type: ``"pixels"`` or ``"pixels_agent_pos"``.
+            render_mode: Stored on the instance as ``render_mode``.
+            render_resolution: ``(height, width)`` of every camera image.
+            robot: Robot name handed to ``load_env``.
+            max_episode_steps: Stored as ``_max_episode_steps``.
+            action_mode: Stored as ``action_mode``.
+
+        Raises:
+            NotImplementedError: If ``obs_type`` is ``"state"``.
+            ValueError: If ``obs_type`` is any other unsupported value.
+        """
+        super().__init__()
+        self.task = task
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.render_resolution = render_resolution
+        self.robot = robot
+        self._max_episode_steps = max_episode_steps
+        self.action_mode = action_mode
+
+        # The simulator is built lazily, on the first reset() inside the worker
+        # subprocess, so that AsyncVectorEnv workers do not inherit stale
+        # GPU/EGL contexts. `env.physics` is deliberately never cached:
+        # dm_control exposes it as a weakref proxy that goes stale across
+        # resets (the sim is rebuilt), so it is refetched through
+        # `self._env.physics` wherever it is needed.
+        self._env = None
+        self.task_description = ""  # populated on first reset
+        # World-frame XYZ of the robot base link, cached once per env build.
+        # The VLABench datasets log both `observation.state` positions and
+        # `actions` positions in robot-base frame (see
+        # VLABench/scripts/convert_to_lerobot.py, which subtracts
+        # `robot_frame_pos` from ee_pos). The robot is attached at a fixed
+        # offset per task, which is what makes caching safe.
+        self._robot_base_xyz: np.ndarray | None = None
+
+        h, w = self.render_resolution
+
+        if self.obs_type == "state":
+            raise NotImplementedError(
+                "The 'state' observation type is not supported in VLABenchEnv. "
+                "Please use 'pixels' or 'pixels_agent_pos'."
+            )
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # Both supported obs types share the same three camera images.
+        camera_boxes = {}
+        for camera_key in ("image", "second_image", "wrist_image"):
+            camera_boxes[camera_key] = spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8)
+        obs_components = {"pixels": spaces.Dict(camera_boxes)}
+        if self.obs_type == "pixels_agent_pos":
+            obs_components["agent_pos"] = spaces.Box(
+                low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64
+            )
+        self.observation_space = spaces.Dict(obs_components)
+
+        self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32)
+
+    # Max attempts to rebuild the underlying env when MuJoCo throws
+    # `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step
+    # reset warm-up. Some random task/layout samples land in unstable
+    # initial configurations; re-sampling the layout almost always
+    # gives a stable one. A handful of upstream tasks (notably
+    # `select_mahjong`) have layout samplers that diverge often enough
+    # to need >>5 retries, so we pick a generous ceiling.
+    _ENSURE_ENV_MAX_ATTEMPTS = 20
+
+    def _ensure_env(self) -> None:
+        """Create the underlying VLABench env on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+
+        Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20
+        warm-up `step()` calls while toggling gravity/fluids to let the scene
+        settle; for some random layouts MuJoCo's integrator diverges and
+        raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields
+        a stable one, so we retry a number of times before giving up. Between
+        attempts we reseed NumPy's global RNG from OS entropy so the upstream
+        task sampler explores fresh initial states — without this, retries
+        can replay the same diverging configuration when the sampler is
+        deterministic given the current RNG state.
+
+        After a successful build this also fills in ``task_description`` and
+        caches the robot base position in ``_robot_base_xyz``. If the base
+        position cannot be read, a warning is logged and a hard-coded default
+        is used.
+
+        Raises:
+            RuntimeError: If every one of ``_ENSURE_ENV_MAX_ATTEMPTS`` build
+                attempts ends in a ``PhysicsError``.
+        """
+        if self._env is not None:
+            return
+
+        import VLABench.robots  # noqa: F401  # type: ignore[import-untyped]
+        import VLABench.tasks  # noqa: F401  # type: ignore[import-untyped]
+        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
+        from VLABench.envs import load_env  # type: ignore[import-untyped]
+
+        h, w = self.render_resolution
+        last_exc: PhysicsError | None = None
+        for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
+            try:
+                env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
+                self._env = env
+                break
+            except PhysicsError as exc:
+                last_exc = exc
+                logger.warning(
+                    "PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
+                    attempt,
+                    self._ENSURE_ENV_MAX_ATTEMPTS,
+                    self.task,
+                    exc,
+                )
+                # Side effect: reseeds NumPy's *global* RNG for this process.
+                np.random.seed(None)
+        if self._env is None:
+            assert last_exc is not None
+            raise RuntimeError(
+                f"VLABench task '{self.task}' failed to produce a stable "
+                f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
+                f"attempts. This task's upstream sampler diverges too "
+                f"often for the configured robot; consider removing it "
+                f"from the eval set. Last physics error: {last_exc}"
+            ) from last_exc
+
+        # Extract task description from the dm_control task
+        task_obj = self._env.task
+        if hasattr(task_obj, "task_description"):
+            self.task_description = task_obj.task_description
+        elif hasattr(task_obj, "language_instruction"):
+            self.task_description = task_obj.language_instruction
+        else:
+            self.task_description = self.task
+
+        # Cache robot base world position so `_build_ctrl_from_action` and
+        # `_get_obs` can translate between robot-frame (dataset) and
+        # world-frame (dm_control) without hitting physics every call.
+        try:
+            self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
+                3
+            )
+        except Exception:
+            # Fallback to VLABench's default Franka base position. Log it: this
+            # offset feeds the frame translation mentioned above, so a silent
+            # fallback would make a wrong base position hard to diagnose.
+            logger.warning(
+                "Could not read the robot base position for task '%s'; "
+                "falling back to the default Franka base position.",
+                self.task,
+                exc_info=True,
+            )
+            self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)
+
+    def _get_obs(self) -> dict:
+        """Build the gym observation from the inner environment's current state.
+
+        Camera frames found under ``obs["rgb"]`` are coerced to ``(h, w, 3)``
+        uint8 arrays and assigned, in order, to the keys ``image``,
+        ``second_image`` and ``wrist_image``; slots without a frame are
+        zero-filled. The raw ``ee_state`` ``[pos_world(3), quat_wxyz(4), open(1)]``
+        is converted to ``[pos_robot(3), euler_xyz(3), gripper(1)]``.
+
+        Returns:
+            ``{"pixels": images}`` when ``obs_type == "pixels"``, or
+            ``{"pixels": images, "agent_pos": ee_state}`` when
+            ``obs_type == "pixels_agent_pos"`` (``ee_state`` is float64).
+
+        Raises:
+            ValueError: If ``self.obs_type`` is neither of the two modes above.
+        """
+        assert self._env is not None
+
+        obs = self._env.get_observation()
+        h, w = self.render_resolution
+
+        def _to_hwc3(arr: np.ndarray) -> np.ndarray:
+            """Coerce any camera array to the declared (h, w, 3) uint8 shape."""
+            a = np.asarray(arr)
+            # Drop a leading singleton batch dim if present.
+            while a.ndim > 3 and a.shape[0] == 1:
+                a = a[0]
+            if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
+                # CHW → HWC
+                a = np.transpose(a, (1, 2, 0))
+            if a.ndim == 2:
+                a = np.stack([a] * 3, axis=-1)
+            if a.ndim != 3:
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            # Force 3 channels.
+            if a.shape[-1] == 1:
+                a = np.repeat(a, 3, axis=-1)
+            elif a.shape[-1] == 4:
+                a = a[..., :3]
+            elif a.shape[-1] != 3:
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            if a.shape[:2] != (h, w):
+                a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
+            return a.astype(np.uint8)
+
+        # Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
+        raw_frames: list[np.ndarray] = []
+        if "rgb" in obs:
+            rgb = obs["rgb"]
+            if isinstance(rgb, np.ndarray):
+                if rgb.ndim == 4:
+                    raw_frames = [rgb[i] for i in range(rgb.shape[0])]
+                elif rgb.ndim == 3:
+                    raw_frames = [rgb]
+
+        image_keys = ["image", "second_image", "wrist_image"]
+        images: dict[str, np.ndarray] = {}
+        for i, key in enumerate(image_keys):
+            if i < len(raw_frames):
+                images[key] = _to_hwc3(raw_frames[i])
+            else:
+                images[key] = np.zeros((h, w, 3), dtype=np.uint8)
+
+        # Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
+        # to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
+        # gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
+        # are stored in robot-base frame and orientations as scipy extrinsic
+        # 'xyz' euler angles.
+        raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
+        pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
+        quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
+        gripper = float(raw[7]) if raw.size >= 8 else 0.0
+
+        # A missing `ee_state` falls back to all zeros above, which has size 8
+        # and therefore yields a zero-norm quaternion that scipy rejects with a
+        # ValueError. Treat a zero quaternion as "no rotation" so the fallback
+        # path still produces a usable observation.
+        if np.linalg.norm(quat_wxyz) < 1e-12:
+            quat_wxyz = np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
+
+        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
+        pos_robot = pos_world - base
+        # scipy expects `[x, y, z, w]`; the raw state stores `[w, x, y, z]`.
+        euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
+            "xyz", degrees=False
+        )
+
+        ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+        elif self.obs_type == "pixels_agent_pos":
+            return {
+                "pixels": images,
+                "agent_pos": ee_state,
+            }
+        else:
+            raise ValueError(f"Unknown obs_type: {self.obs_type}")
+
+    # ---- Action adaptation (EEF → joint ctrl) --------------------------------
+    #
+    # The HF vlabench datasets log 7D actions
+    # `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
+    # exactly matching VLABench's own eval pipeline (evaluator.base):
+    #   pos, euler, g = policy(...)
+    #   quat = euler_to_quaternion(*euler)      # extrinsic xyz -> wxyz
+    #   _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
+    #   env.step(np.concatenate([qpos, [g, g]]))
+    #
+    # VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
+    # Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
+    # above conversion so the policy's EEF commands actually drive the robot.
+
+    _FRANKA_FINGER_OPEN = 0.04  # qpos when gripper fully open
+
+    def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
+        """Map a policy action onto the simulator's `ctrl_dim`-sized command vector.
+
+        Three layouts are handled, selected by the action's length:
+
+        * `ctrl_dim` entries: already joint-space, returned as float64.
+        * 7 entries: an EEF command `[x, y, z, rx, ry, rz, gripper]`; the arm
+          joints are solved with dm_control IK and every remaining ctrl slot
+          is filled with the finger opening derived from the gripper scalar.
+        * anything else: copied into a zero vector (truncated or zero-padded).
+
+        Args:
+            action: 1-D action vector.
+            ctrl_dim: Number of entries expected in the returned vector.
+
+        Returns:
+            A float64 array with `ctrl_dim` entries.
+        """
+        n_action = action.shape[0]
+
+        # Joint-space commands need no conversion.
+        if n_action == ctrl_dim:
+            return action.astype(np.float64, copy=False)
+
+        # Unknown layout — copy what fits into zeros so the sim doesn't crash.
+        if n_action != 7:
+            fallback = np.zeros(ctrl_dim, dtype=np.float64)
+            n_copy = min(n_action, ctrl_dim)
+            fallback[:n_copy] = action[:n_copy]
+            return fallback
+
+        from dm_control.utils.inverse_kinematics import qpos_from_site_pose
+
+        # The action position is expressed in the robot-base frame (see
+        # convert_to_lerobot.py) while dm_control's IK wants a world-frame
+        # target, so shift by the cached base position (zeros if not cached).
+        base_offset = self._robot_base_xyz
+        if base_offset is None:
+            base_offset = np.zeros(3, dtype=np.float64)
+        target_pos = np.asarray(action[:3], dtype=np.float64) + base_offset
+        euler_angles = [float(action[3]), float(action[4]), float(action[5])]
+        grip_cmd = float(np.clip(action[6], 0.0, 1.0))
+
+        # The dataset stores scipy extrinsic 'xyz' euler angles (same as
+        # VLABench's `euler_to_quaternion`). scipy returns `[x, y, z, w]`
+        # whereas dm_control's IK and MuJoCo want `[w, x, y, z]`.
+        quat_xyzw = Rotation.from_euler("xyz", euler_angles, degrees=False).as_quat()
+        target_quat = np.array(
+            [quat_xyzw[3], quat_xyzw[0], quat_xyzw[1], quat_xyzw[2]],
+            dtype=np.float64,
+        )
+
+        assert self._env is not None
+        arm = self._env.task.robot
+        ee_site = arm.end_effector_site.full_identifier
+
+        # Solve without touching the live physics state (inplace=False): only
+        # the resulting qpos is needed. The physics handle is fetched fresh
+        # because a cached one can be a stale weakref after a reset.
+        solution = qpos_from_site_pose(
+            self._env.physics,
+            site_name=ee_site,
+            target_pos=target_pos,
+            target_quat=target_quat,
+            inplace=False,
+            max_steps=100,
+        )
+        n_arm = arm.n_dof  # 7 for Franka
+
+        # Dataset gripper convention: 1 = open (finger qpos = 0.04),
+        # 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
+        # where `trajectory[i][-1] > 0.03` is encoded as `1`.
+        finger_target = grip_cmd * self._FRANKA_FINGER_OPEN
+
+        command = np.zeros(ctrl_dim, dtype=np.float64)
+        command[:n_arm] = solution.qpos[:n_arm]
+        # Every slot after the arm joints is a gripper finger (usually 2 for Franka).
+        command[n_arm:] = finger_target
+        return command
+
+    def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
+        """Reset the episode and return the first observation.
+
+        Args:
+            seed: Optional seed forwarded to the parent `reset`; when given, a
+                seed drawn from `self.np_random` is also pushed into the inner
+                environment.
+            **kwargs: Accepted for API compatibility; not used here.
+
+        Returns:
+            `(observation, info)` where `info` is `{"is_success": False}`.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        super().reset(seed=seed)
+
+        # Derive the inner env's seed from the freshly seeded gym RNG.
+        if seed is not None:
+            inner_seed = int(self.np_random.integers(0, 2**31 - 1))
+            self._seed_inner_env(inner_seed)
+
+        self._env.reset()
+
+        first_obs = self._get_obs()
+        return first_obs, {"is_success": False}
+
+    def _seed_inner_env(self, seed: int) -> None:
+        """Propagate `seed` to the inner dm_control env. `Environment.reset()`
+        doesn't accept a seed, so we re-seed the task and environment
+        `RandomState`s directly. Best-effort: silently skipped when the
+        expected attributes are absent on a given VLABench version.
+
+        Args:
+            seed: Value passed to each RNG's `seed()` method that is found.
+        """
+        for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
+            # Default to None so an env without a `task` attribute is skipped
+            # instead of raising AttributeError, as the docstring promises.
+            owner = getattr(self._env, owner_attr, None) if owner_attr else self._env
+            rng = getattr(owner, rng_attr, None)
+            rng_seed = getattr(rng, "seed", None)
+            if callable(rng_seed):
+                rng_seed(seed)
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one action to the simulator and return the gym 5-tuple.
+
+        Args:
+            action: 1-D action vector; converted to a ctrl vector by
+                `_build_ctrl_from_action`.
+
+        Returns:
+            `(observation, reward, terminated, truncated, info)`. `truncated`
+            is always False. `info` carries `task` and `is_success`, plus
+            `physics_error=True` when the step raised `PhysicsError`.
+
+        Raises:
+            ValueError: If `action` is not 1-D or `self.action_mode` is not
+                one of "eef", "joint", "delta_eef".
+        """
+        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
+
+        self._ensure_env()
+        assert self._env is not None
+
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        # NOTE(review): action_mode is only validated here; the conversion
+        # below selects its behaviour from the action's length alone.
+        if self.action_mode not in ("eef", "joint", "delta_eef"):
+            raise ValueError(f"Unknown action_mode: {self.action_mode}")
+
+        # Always refetch physics — dm_control returns a weakref proxy that can
+        # go stale across resets.
+        physics = self._env.physics
+        ctrl_dim = int(physics.data.ctrl.shape[0])
+        ctrl = self._build_ctrl_from_action(action, ctrl_dim)
+        try:
+            timestep = self._env.step(ctrl)
+        except PhysicsError as exc:
+            # Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
+            # a graceful failed termination rather than a hard crash — the
+            # rest of the multi-task eval should still run.
+            logger.warning(
+                "PhysicsError during step on task '%s': %s. Terminating episode.",
+                self.task,
+                exc,
+            )
+            # Capture the observation while the env handle is still alive.
+            observation = self._get_obs()
+            info = {"task": self.task, "is_success": False, "physics_error": True}
+            # Drop the stale env so the next reset() rebuilds it cleanly.
+            with contextlib.suppress(Exception):
+                self._env.close()
+            self._env = None
+            return observation, 0.0, True, False, info
+
+        # Extract reward from dm_control timestep
+        reward = float(timestep.reward) if timestep.reward is not None else 0.0
+
+        # Check success via the task's termination condition
+        is_success = False
+        if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
+            is_success = bool(self._env.task.should_terminate_episode(self._env.physics))
+
+        # Success is the only termination signal produced on this path.
+        terminated = is_success
+        truncated = False
+        info = {
+            "task": self.task,
+            "is_success": is_success,
+        }
+
+        observation = self._get_obs()
+
+        # Reset after a successful episode; the observation returned below is
+        # the one captured before this reset.
+        if terminated:
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+    def render(self) -> np.ndarray:
+        """Return the `image` camera frame from a freshly built observation."""
+        self._ensure_env()
+        frames = self._get_obs()["pixels"]
+        return frames["image"]
+
+    def close(self):
+        """Close the inner environment, if one exists, and drop the handle."""
+        inner = self._env
+        if inner is None:
+            return
+        inner.close()
+        self._env = None
+
+
+# ---- Main API ----------------------------------------------------------------
+
+
+def create_vlabench_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] | None = None,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+) -> dict[str, dict[int, Any]]:
+    """
+    Build one vectorized VLABench environment per requested task.
+
+    Args:
+        task: Comma-separated suite names and/or individual task names. A name found in
+            `SUITE_TASKS` expands to that suite's tasks; any other name is a single task.
+        n_envs: Number of rollouts (environment factories) per task; must be a positive int.
+        gym_kwargs: Extra keyword arguments forwarded to every `VLABenchEnv`.
+        env_cls: Callable that wraps a list of environment factories into a vector env.
+
+    Returns:
+        dict[group_name][task_id] -> vec_env, where task_id is the task's index within its group.
+
+    Raises:
+        ValueError: If `env_cls` is not callable, `n_envs` is not a positive int, or
+            `task` contains no non-empty name.
+    """
+    if not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not (isinstance(n_envs, int) and n_envs > 0):
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    env_kwargs = dict(gym_kwargs or {})
+    requested = [name for name in (part.strip() for part in task.split(",")) if name]
+    if not requested:
+        raise ValueError("`task` must contain at least one VLABench task or suite name.")
+
+    logger.info(
+        "Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
+        requested,
+        n_envs,
+    )
+
+    use_lazy_async = env_cls is gym.vector.AsyncVectorEnv
+    # Spaces and metadata read from the first lazy env and handed to later ones.
+    shared_obs_space = None
+    shared_act_space = None
+    shared_metadata = None
+    result: dict[str, dict[int, Any]] = {}
+
+    for group in requested:
+        # A known suite expands to its task list; anything else is one task.
+        for tid, task_name in enumerate(SUITE_TASKS.get(group, [group])):
+            logger.info(
+                "Building vec env | group=%s | task_id=%d | task=%s",
+                group,
+                tid,
+                task_name,
+            )
+
+            # Bind the task name as a default argument so each factory keeps its own task.
+            factories = [(lambda tn=task_name: VLABenchEnv(task=tn, **env_kwargs)) for _ in range(n_envs)]
+
+            if not use_lazy_async:
+                vec_env = env_cls(factories)
+                result.setdefault(group, {})[tid] = vec_env
+                continue
+
+            lazy = _LazyAsyncVectorEnv(factories, shared_obs_space, shared_act_space, shared_metadata)
+            if shared_obs_space is None:
+                shared_obs_space = lazy.observation_space
+                shared_act_space = lazy.action_space
+                shared_metadata = lazy.metadata
+            result.setdefault(group, {})[tid] = lazy
+
+    return {group: dict(task_map) for group, task_map in result.items()}
@@ -12,8 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import numpy as np

+from lerobot.utils.import_utils import _placo_available, require_package
+
+if TYPE_CHECKING or _placo_available:
+    import placo  # type: ignore[import-not-found]
+else:
+    placo = None
+

 class RobotKinematics:
    """Robot kinematics using placo library for forward and inverse kinematics."""
@@ -32,13 +43,7 @@ class RobotKinematics:
            target_frame_name (str): Name of the end-effector frame in the URDF
            joint_names (list[str] | None): List of joint names to use for the kinematics solver
        """
-        try:
-            import placo  # type: ignore[import-not-found] # C++ library with Python bindings, no type stubs available. TODO: Create stub file or request upstream typing support.
-        except ImportError as e:
-            raise ImportError(
-                "placo is required for RobotKinematics. "
-                "Please install the optional dependencies of `kinematics` in the package."
-            ) from e
+        require_package("placo", extra="placo-dep")

        self.robot = placo.RobotWrapper(urdf_path)
        self.solver = placo.KinematicsSolver(self.robot)
@@ -24,7 +24,7 @@ from functools import cached_property
 from typing import TYPE_CHECKING, Any, TypedDict

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _can_available
+from lerobot.utils.import_utils import _can_available, require_package

 if TYPE_CHECKING or _can_available:
    import can
@@ -111,6 +111,7 @@ class DamiaoMotorsBus(MotorsBusBase):
            bitrate: Nominal bitrate in bps (default: 1000000 = 1 Mbps)
            data_bitrate: Data bitrate for CAN FD in bps (default: 5000000 = 5 Mbps), ignored if use_can_fd is False
        """
+        require_package("python-can", extra="damiao", import_name="can")
        super().__init__(port, motors, calibration)
        self.port = port
        self.can_interface = can_interface
@@ -356,8 +356,8 @@ class SerialMotorsBus(MotorsBusBase):
        motors: dict[str, Motor],
        calibration: dict[str, MotorCalibration] | None = None,
    ):
-        require_package("pyserial", extra="hardware", import_name="serial")
-        require_package("deepdiff", extra="hardware")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
+        require_package("deepdiff", extra="deepdiff-dep")
        super().__init__(port, motors, calibration)

        self.port_handler: PortHandler
@@ -23,12 +23,12 @@ from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, TypedDict

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _can_available
+from lerobot.utils.import_utils import _can_available, require_package

 if TYPE_CHECKING or _can_available:
    import can
 else:
-    can = SimpleNamespace(Message=object, interface=None)
+    can = SimpleNamespace(Message=object, interface=None, BusABC=object)
 import numpy as np

 from lerobot.utils.errors import DeviceNotConnectedError
@@ -106,6 +106,7 @@ class RobstrideMotorsBus(MotorsBusBase):
            bitrate: Nominal bitrate in bps (default: 1000000 = 1 Mbps)
            data_bitrate: Data bitrate for CAN FD in bps (default: 5000000 = 5 Mbps), ignored if use_can_fd is False
        """
+        require_package("python-can", extra="robstride", import_name="can")
        super().__init__(port, motors, calibration)
        self.port = port
        self.can_interface = can_interface
@@ -18,14 +18,21 @@ import logging
 import math
 from dataclasses import asdict, dataclass
 from pathlib import Path
+from typing import TYPE_CHECKING

 import draccus
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, LRScheduler

 from lerobot.utils.constants import SCHEDULER_STATE
+from lerobot.utils.import_utils import _diffusers_available, require_package
 from lerobot.utils.io_utils import deserialize_json_into_object, write_json

+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.optimization import get_scheduler
+else:
+    get_scheduler = None
+

@dataclass
 class LRSchedulerConfig(draccus.ChoiceRegistry, abc.ABC):
@@ -47,10 +54,7 @@ class DiffuserSchedulerConfig(LRSchedulerConfig):
    num_warmup_steps: int | None = None

    def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
-        from lerobot.utils.import_utils import require_package
-
        require_package("diffusers", extra="diffusion")
-        from diffusers.optimization import get_scheduler

        kwargs = {**asdict(self), "num_training_steps": num_training_steps, "optimizer": optimizer}
        return get_scheduler(**kwargs)
@@ -142,9 +142,10 @@ class ACTPolicy(PreTrainedPolicy):

        actions_hat, (mu_hat, log_sigma_x2_hat) = self.model(batch)

-        l1_loss = (
-            F.l1_loss(batch[ACTION], actions_hat, reduction="none") * ~batch["action_is_pad"].unsqueeze(-1)
-        ).mean()
+        abs_err = F.l1_loss(batch[ACTION], actions_hat, reduction="none")
+        valid_mask = ~batch["action_is_pad"].unsqueeze(-1)
+        num_valid = valid_mask.sum() * abs_err.shape[-1]
+        l1_loss = (abs_err * valid_mask).sum() / num_valid.clamp_min(1)

        loss_dict = {"l1_loss": l1_loss.item()}
        if self.config.use_vae:
@@ -23,6 +23,7 @@ TODO(alexander-soare):
 import math
 from collections import deque
 from collections.abc import Callable
+from typing import TYPE_CHECKING

 import einops
 import numpy as np
@@ -32,6 +33,14 @@ import torchvision
 from torch import Tensor, nn

 from lerobot.utils.constants import ACTION, OBS_ENV_STATE, OBS_IMAGES, OBS_STATE
+from lerobot.utils.import_utils import _diffusers_available, require_package
+
+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
+    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+else:
+    DDIMScheduler = None
+    DDPMScheduler = None

 from ..pretrained import PreTrainedPolicy
 from ..utils import (
@@ -64,6 +73,7 @@ class DiffusionPolicy(PreTrainedPolicy):
            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
                that they will be passed with a call to `load_state_dict` before the policy is used.
        """
+        require_package("diffusers", extra="diffusion")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -155,11 +165,7 @@ def _make_noise_scheduler(name: str, **kwargs: dict):
    Factory for noise scheduler instances of the requested type. All kwargs are passed
    to the scheduler.
    """
-    from lerobot.utils.import_utils import require_package
-
    require_package("diffusers", extra="diffusion")
-    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
-    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler

    if name == "DDPM":
        return DDPMScheduler(**kwargs)
@@ -374,7 +380,9 @@ class DiffusionModel(nn.Module):
                    f"{self.config.do_mask_loss_for_padding=}."
                )
            in_episode_bound = ~batch["action_is_pad"]
-            loss = loss * in_episode_bound.unsqueeze(-1)
+            mask = in_episode_bound.unsqueeze(-1)
+            num_valid = mask.sum() * loss.shape[-1]
+            return (loss * mask).sum() / num_valid.clamp_min(1)

        return loss.mean()

@@ -204,7 +204,9 @@ class FlowmatchingActionHead(nn.Module):
            self.position_embedding = nn.Embedding(config.max_seq_len, self.input_embedding_dim)
            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)

-        self.beta_dist = Beta(config.noise_beta_alpha, config.noise_beta_beta)
+        self._noise_beta_alpha = config.noise_beta_alpha
+        self._noise_beta_beta = config.noise_beta_beta
+        self._beta_dist = None
        self.num_timestep_buckets = config.num_timestep_buckets
        self.config = config
        self.set_trainable_parameters(config.tune_projector, config.tune_diffusion_model)
@@ -249,7 +251,9 @@ class FlowmatchingActionHead(nn.Module):
                self.model.eval()

    def sample_time(self, batch_size, device, dtype):
-        sample = self.beta_dist.sample([batch_size]).to(device, dtype=dtype)
+        if self._beta_dist is None:
+            self._beta_dist = Beta(self._noise_beta_alpha, self._noise_beta_beta, validate_args=False)
+        sample = self._beta_dist.sample([batch_size]).to(device, dtype=dtype)
        return (self.config.noise_s - sample) / self.config.noise_s

    def prepare_input(self, batch: dict) -> BatchFeature:
@@ -222,6 +222,13 @@ class Eagle25VLProcessor(ProcessorMixin):
                        videos=None,
                        **output_kwargs["images_kwargs"],
                    )
+                    if isinstance(image_inputs["pixel_values"], list):
+                        _pv = image_inputs["pixel_values"]
+                        if _pv and isinstance(_pv[0], list):
+                            _pv = [t for sub in _pv for t in sub]
+                        image_inputs["pixel_values"] = torch.stack(
+                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
+                        )
                    num_all_tiles = image_inputs["pixel_values"].shape[0]
                    special_placeholder = f"<image {idx_in_list + 1}>{self.image_start_token}{self.image_token * num_all_tiles * self.tokens_per_tile}{self.image_end_token}"
                    unified_frame_list.append(image_inputs)
@@ -233,6 +240,13 @@ class Eagle25VLProcessor(ProcessorMixin):
                        videos=[video_list[idx_in_list]],
                        **output_kwargs["videos_kwargs"],
                    )
+                    if isinstance(video_inputs["pixel_values"], list):
+                        _pv = video_inputs["pixel_values"]
+                        if _pv and isinstance(_pv[0], list):
+                            _pv = [t for sub in _pv for t in sub]
+                        video_inputs["pixel_values"] = torch.stack(
+                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
+                        )
                    num_all_tiles = video_inputs["pixel_values"].shape[0]
                    image_sizes = video_inputs["image_sizes"]
                    if timestamps_list is not None and -1 not in timestamps_list:
@@ -288,8 +302,18 @@ class Eagle25VLProcessor(ProcessorMixin):

        text = replace_in_text(text)
        if len(unified_frame_list) > 0:
-            pixel_values = torch.cat([frame["pixel_values"] for frame in unified_frame_list])
-            image_sizes = torch.cat([frame["image_sizes"] for frame in unified_frame_list])
+
+            def _to_tensor(v):
+                if isinstance(v, torch.Tensor):
+                    return v
+                if isinstance(v, list):
+                    if v and isinstance(v[0], list):
+                        v = [t for sub in v for t in sub]
+                    return torch.stack([t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in v])
+                return torch.as_tensor(v)
+
+            pixel_values = torch.cat([_to_tensor(frame["pixel_values"]) for frame in unified_frame_list])
+            image_sizes = torch.cat([_to_tensor(frame["image_sizes"]) for frame in unified_frame_list])
        else:
            pixel_values = None
            image_sizes = None
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING

@@ -174,17 +173,14 @@ N_COLOR_CHANNELS = 3


 # config
-@dataclass
 class GR00TN15Config(PretrainedConfig):
    model_type = "gr00t_n1_5"
-    backbone_cfg: dict = field(init=False, metadata={"help": "Backbone configuration."})

-    action_head_cfg: dict = field(init=False, metadata={"help": "Action head configuration."})
-
-    action_horizon: int = field(init=False, metadata={"help": "Action horizon."})
-
-    action_dim: int = field(init=False, metadata={"help": "Action dimension."})
-    compute_dtype: str = field(default="float32", metadata={"help": "Compute dtype."})
+    backbone_cfg: dict
+    action_head_cfg: dict
+    action_horizon: int
+    action_dim: int
+    compute_dtype: str = "float32"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
@@ -221,6 +217,7 @@ class GR00TN15(PreTrainedModel):
        self.action_horizon = config.action_horizon
        self.action_dim = config.action_dim
        self.compute_dtype = config.compute_dtype
+        self.post_init()

    def validate_inputs(self, inputs):
        # NOTE -- this should be handled internally by the model
@@ -43,6 +43,7 @@ from torch import Tensor

 from lerobot.configs import FeatureType, PolicyFeature
 from lerobot.utils.constants import ACTION, OBS_IMAGES
+from lerobot.utils.import_utils import require_package

 from ..pretrained import PreTrainedPolicy
 from .configuration_groot import GrootConfig
@@ -59,6 +60,7 @@ class GrootPolicy(PreTrainedPolicy):

    def __init__(self, config: GrootConfig, **kwargs):
        """Initialize Groot policy wrapper."""
+        require_package("transformers", extra="groot")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -36,7 +36,7 @@ import torch.nn.functional as F  # noqa: N812
 import torchvision
 from torch import Tensor

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _diffusers_available, _transformers_available, require_package

 from .configuration_multi_task_dit import MultiTaskDiTConfig

@@ -46,6 +46,13 @@ if TYPE_CHECKING or _transformers_available:
 else:
    CLIPTextModel = None
    CLIPVisionModel = None
+
+if TYPE_CHECKING or _diffusers_available:
+    from diffusers.schedulers.scheduling_ddim import DDIMScheduler
+    from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
+else:
+    DDIMScheduler = None
+    DDPMScheduler = None
 from lerobot.utils.constants import (
    ACTION,
    OBS_IMAGES,
@@ -65,6 +72,8 @@ class MultiTaskDiTPolicy(PreTrainedPolicy):
    name = "multi_task_dit"

    def __init__(self, config: MultiTaskDiTConfig, **kwargs):
+        require_package("transformers", extra="multi_task_dit")
+        require_package("diffusers", extra="multi_task_dit")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -643,12 +652,6 @@ class DiffusionObjective(nn.Module):
            "prediction_type": config.prediction_type,
        }

-        from lerobot.utils.import_utils import require_package
-
-        require_package("diffusers", extra="multi_task_dit")
-        from diffusers.schedulers.scheduling_ddim import DDIMScheduler
-        from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
-
        if config.noise_scheduler_type == "DDPM":
            self.noise_scheduler: DDPMScheduler | DDIMScheduler = DDPMScheduler(**scheduler_kwargs)
        elif config.noise_scheduler_type == "DDIM":
@@ -685,8 +688,9 @@ class DiffusionObjective(nn.Module):
        loss = F.mse_loss(predicted, target, reduction="none")

        if self.do_mask_loss_for_padding and "action_is_pad" in batch:
-            valid_actions = ~batch["action_is_pad"]
-            loss = loss * valid_actions.unsqueeze(-1)
+            mask = ~batch["action_is_pad"].unsqueeze(-1)
+            num_valid = mask.sum() * loss.shape[-1]
+            return (loss * mask).sum() / num_valid.clamp_min(1)

        return loss.mean()

@@ -749,8 +753,9 @@ class FlowMatchingObjective(nn.Module):
        loss = F.mse_loss(predicted_velocity, target_velocity, reduction="none")

        if self.do_mask_loss_for_padding and "action_is_pad" in batch:
-            valid_mask = ~batch["action_is_pad"]
-            loss = loss * valid_mask.unsqueeze(-1)
+            mask = ~batch["action_is_pad"].unsqueeze(-1)
+            num_valid = mask.sum() * loss.shape[-1]
+            return (loss * mask).sum() / num_valid.clamp_min(1)

        return loss.mean()

@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
@@ -947,6 +947,7 @@ class PI0Policy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _transformers_available
+from lerobot.utils.import_utils import _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
@@ -918,6 +918,7 @@ class PI05Policy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -26,7 +26,7 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn

-from lerobot.utils.import_utils import _scipy_available, _transformers_available
+from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _scipy_available:
@@ -35,7 +35,7 @@ else:
    idct = None

 if TYPE_CHECKING or _transformers_available:
-    from transformers import AutoTokenizer
+    from transformers import AutoProcessor, AutoTokenizer
    from transformers.models.auto import CONFIG_MAPPING

    from ..pi_gemma import (
@@ -44,6 +44,7 @@ if TYPE_CHECKING or _transformers_available:
    )
 else:
    CONFIG_MAPPING = None
+    AutoProcessor = None
    AutoTokenizer = None
    PiGemmaModel = None
    PaliGemmaForConditionalGenerationWithPiGemma = None
@@ -826,14 +827,14 @@ class PI0FastPolicy(PreTrainedPolicy):
        Args:
            config: Policy configuration class instance.
        """
+        require_package("transformers", extra="pi")
+        require_package("scipy", extra="pi")
        super().__init__(config)
        config.validate_features()
        self.config = config

        # Load tokenizers first
        try:
-            from transformers import AutoProcessor, AutoTokenizer
-
            # Load FAST tokenizer
            self.action_tokenizer = AutoProcessor.from_pretrained(
                config.action_tokenizer_name, trust_remote_code=True
@@ -455,7 +455,13 @@ class SARMEncodingProcessorStep(ProcessorStep):
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Get image embeddings
-            embeddings = self.clip_model.get_image_features(**inputs).detach().cpu()
+            # transformers 5.x returns BaseModelOutputWithPooling instead of a plain tensor
+            output = self.clip_model.get_image_features(**inputs)
+            if not isinstance(output, torch.Tensor):
+                output = output.pooler_output
+                if output is None:
+                    raise ValueError("pooler_output should not be None for CLIP models.")
+            embeddings = output.detach().cpu()

            # Handle single frame case
            if embeddings.dim() == 1:
@@ -482,7 +488,13 @@ class SARMEncodingProcessorStep(ProcessorStep):
        inputs = self.clip_processor.tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

-        text_embedding = self.clip_model.get_text_features(**inputs).detach().cpu()
+        # transformers 5.x returns BaseModelOutputWithPooling instead of a plain tensor
+        output = self.clip_model.get_text_features(**inputs)
+        if not isinstance(output, torch.Tensor):
+            output = output.pooler_output
+            if output is None:
+                raise ValueError("pooler_output should not be None for CLIP models.")
+        text_embedding = output.detach().cpu()
        text_embedding = text_embedding.expand(batch_size, -1)

        return text_embedding
@@ -62,6 +62,7 @@ from torch import Tensor, nn

 from lerobot.utils.constants import ACTION, OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS, OBS_STATE
 from lerobot.utils.device_utils import get_safe_dtype
+from lerobot.utils.import_utils import require_package

 from ..pretrained import PreTrainedPolicy
 from ..rtc.modeling_rtc import RTCProcessor
@@ -239,6 +240,7 @@ class SmolVLAPolicy(PreTrainedPolicy):
                    the configuration class is used.
        """

+        require_package("transformers", extra="smolvla")
        super().__init__(config)
        config.validate_features()
        self.config = config
@@ -392,13 +394,21 @@ class SmolVLAPolicy(PreTrainedPolicy):
        loss_dict["losses_after_rm_padding"] = losses.clone().mean().item()

        if reduction == "none":
-            # Return per-sample losses (B,) by averaging over time and action dims
-            per_sample_loss = losses.mean(dim=(1, 2))
+            # Return per-sample losses (B,) by averaging over valid (time, action) entries
+            if actions_is_pad is None:
+                per_sample_loss = losses.mean(dim=(1, 2))
+            else:
+                num_valid = ((~actions_is_pad).sum(dim=1) * losses.shape[-1]).clamp_min(1)
+                per_sample_loss = losses.sum(dim=(1, 2)) / num_valid
            loss_dict["loss"] = per_sample_loss.mean().item()
            return per_sample_loss, loss_dict
        else:
-            # Default: return scalar mean loss
-            loss = losses.mean()
+            # Default: return scalar mean loss over valid (time, action) entries
+            if actions_is_pad is None:
+                loss = losses.mean()
+            else:
+                num_valid = ((~actions_is_pad).sum() * losses.shape[-1]).clamp_min(1)
+                loss = losses.sum() / num_valid
            loss_dict["loss"] = loss.item()
            return loss, loss_dict

@@ -27,7 +27,7 @@ import torch.distributed as distributed
 import torch.nn.functional as F  # noqa: N812
 from einops import pack, rearrange, reduce, repeat, unpack
 from torch import einsum, nn
-from torch.cuda.amp import autocast
+from torch.amp import autocast
 from torch.optim import Optimizer

 from .configuration_vqbet import VQBeTConfig
@@ -1370,7 +1370,7 @@ class EuclideanCodebook(nn.Module):
        batch_samples = rearrange(batch_samples, "h ... d -> h (...) d")
        self.replace(batch_samples, batch_mask=expired_codes)

-    @autocast(enabled=False)
+    @autocast("cuda", enabled=False)
    def forward(self, x, sample_codebook_temp=None, mask=None, freeze_codebook=False):
        needs_codebook_dim = x.ndim < 4
        sample_codebook_temp = (
@@ -321,6 +321,7 @@ class GymHILAdapterProcessorStep(ProcessorStep):
    This step normalizes the `transition` object by:
    1. Copying `teleop_action` from `info` to `complementary_data`.
    2. Copying `is_intervention` from `info` (using the string key) to `info` (using the enum key).
+    3. Copying `discrete_penalty` from `info` to `complementary_data`.
    """

    def __call__(self, transition: EnvTransition) -> EnvTransition:
@@ -330,6 +331,9 @@ class GymHILAdapterProcessorStep(ProcessorStep):
        if TELEOP_ACTION_KEY in info:
            complementary_data[TELEOP_ACTION_KEY] = info[TELEOP_ACTION_KEY]

+        if DISCRETE_PENALTY_KEY in info:
+            complementary_data[DISCRETE_PENALTY_KEY] = info[DISCRETE_PENALTY_KEY]
+
        if "is_intervention" in info:
            info[TeleopEvents.IS_INTERVENTION] = info["is_intervention"]

@@ -348,18 +352,24 @@ class GymHILAdapterProcessorStep(ProcessorStep):
@ProcessorStepRegistry.register("gripper_penalty_processor")
 class GripperPenaltyProcessorStep(ProcessorStep):
    """
-    Applies a penalty for inefficient gripper usage.
+    Applies a small per-transition cost on the discrete gripper action.

-    This step penalizes actions that attempt to close an already closed gripper or
-    open an already open one, based on position thresholds.
+    Fires only when the commanded action would actually transition the gripper
+    from one extreme to the other (close-while-open or open-while-closed).
+    This discourages gripper oscillation while leaving "stay" and saturating-further
+    commands unpenalized.

    Attributes:
        penalty: The negative reward value to apply.
        max_gripper_pos: The maximum position value for the gripper, used for normalization.
+        open_threshold: Normalized cutoff; below it the state is "open" and the action commands open.
+        closed_threshold: Normalized cutoff; above it the state is "closed" and the action commands close.
    """

-    penalty: float = -0.01
+    penalty: float = -0.02
    max_gripper_pos: float = 30.0
+    open_threshold: float = 0.1
+    closed_threshold: float = 0.9

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        """
@@ -391,9 +401,13 @@ class GripperPenaltyProcessorStep(ProcessorStep):
        gripper_state_normalized = current_gripper_pos / self.max_gripper_pos

        # Calculate penalty boolean as in original
-        gripper_penalty_bool = (gripper_state_normalized < 0.5 and gripper_action_normalized > 0.5) or (
-            gripper_state_normalized > 0.75 and gripper_action_normalized < 0.5
-        )
+        #   - currently open  AND target is closed  -> close transition
+        #   - currently closed AND target is open   -> open transition
+        is_open = gripper_state_normalized < self.open_threshold
+        is_closed = gripper_state_normalized > self.closed_threshold
+        cmd_close = gripper_action_normalized > self.closed_threshold
+        cmd_open = gripper_action_normalized < self.open_threshold
+        gripper_penalty_bool = (is_open and cmd_close) or (is_closed and cmd_open)

        gripper_penalty = self.penalty * int(gripper_penalty_bool)

@@ -409,11 +423,14 @@ class GripperPenaltyProcessorStep(ProcessorStep):
        Returns the configuration of the step for serialization.

        Returns:
-            A dictionary containing the penalty value and max gripper position.
+            A dictionary containing the penalty value, max gripper position,
+            and the open/closed thresholds.
        """
        return {
            "penalty": self.penalty,
            "max_gripper_pos": self.max_gripper_pos,
+            "open_threshold": self.open_threshold,
+            "closed_threshold": self.closed_threshold,
        }

    def reset(self) -> None:
@@ -134,6 +134,15 @@ class _NormalizationMixin:
        if self.dtype is None:
            self.dtype = torch.float32
        self._tensor_stats = to_tensor(self.stats, device=self.device, dtype=self.dtype)
+        self._reshape_visual_stats()
+
+    def _reshape_visual_stats(self) -> None:
+        """Reshape visual stats from ``[C]`` to ``[C, 1, 1]`` for image broadcasting."""
+        for key, feature in self.features.items():
+            if feature.type == FeatureType.VISUAL and key in self._tensor_stats:
+                for stat_name, stat_tensor in self._tensor_stats[key].items():
+                    if isinstance(stat_tensor, Tensor) and stat_tensor.ndim == 1:
+                        self._tensor_stats[key][stat_name] = stat_tensor.reshape(-1, 1, 1)

    def to(
        self, device: torch.device | str | None = None, dtype: torch.dtype | None = None
@@ -152,6 +161,7 @@ class _NormalizationMixin:
        if dtype is not None:
            self.dtype = dtype
        self._tensor_stats = to_tensor(self.stats, device=self.device, dtype=self.dtype)
+        self._reshape_visual_stats()
        return self

    def state_dict(self) -> dict[str, Tensor]:
@@ -201,6 +211,7 @@ class _NormalizationMixin:
            # Don't load from state_dict, keep the explicitly provided stats
            # But ensure _tensor_stats is properly initialized
            self._tensor_stats = to_tensor(self.stats, device=self.device, dtype=self.dtype)  # type: ignore[assignment]
+            self._reshape_visual_stats()
            return

        # Normal behavior: load stats from state_dict
@@ -211,6 +222,7 @@ class _NormalizationMixin:
            self._tensor_stats.setdefault(key, {})[stat_name] = tensor.to(
                dtype=torch.float32, device=self.device
            )
+        self._reshape_visual_stats()

        # Reconstruct the original stats dict from tensor stats for compatibility with to() method
        # and other functions that rely on self.stats
@@ -60,7 +60,7 @@ from torch.multiprocessing import Event, Queue
 from lerobot.cameras import opencv  # noqa: F401
 from lerobot.configs import parser
 from lerobot.configs.train import TrainRLServerPipelineConfig
-from lerobot.policies import make_policy
+from lerobot.policies import make_policy, make_pre_post_processors
 from lerobot.policies.sac.modeling_sac import SACPolicy
 from lerobot.robots import so_follower  # noqa: F401
 from lerobot.teleoperators import gamepad, so_leader  # noqa: F401
@@ -89,9 +89,9 @@ from lerobot.utils.utils import (
 )

 from .gym_manipulator import (
-    create_transition,
    make_processors,
    make_robot_env,
+    reset_and_build_transition,
    step_env_and_process_transition,
 )
 from .process import ProcessSignalHandler
@@ -261,13 +261,12 @@ def act_with_policy(
    policy = policy.eval()
    assert isinstance(policy, nn.Module)

-    obs, info = online_env.reset()
-    env_processor.reset()
-    action_processor.reset()
+    preprocessor, postprocessor = make_pre_post_processors(
+        policy_cfg=cfg.policy,
+        dataset_stats=cfg.policy.dataset_stats,
+    )

-    # Process initial observation
-    transition = create_transition(observation=obs, info=info)
-    transition = env_processor(transition)
+    transition = reset_and_build_transition(online_env, env_processor, action_processor)

    # NOTE: For the moment we will solely handle the case of a single environment
    sum_reward_episode = 0
@@ -291,8 +290,21 @@ def act_with_policy(

        # Time policy inference and check if it meets FPS requirement
        with policy_timer:
-            # Extract observation from transition for policy
-            action = policy.select_action(batch=observation)
+            normalized_observation = preprocessor.process_observation(observation)
+            action = policy.select_action(batch=normalized_observation)
+            # Unnormalize only the continuous part. When `num_discrete_actions` is set,
+            # `select_action` concatenates an argmax index in env space at the last dim;
+            # action stats cover the continuous dims only, so feeding the full vector to
+            # the unnormalizer would shape-mismatch and would also corrupt the discrete
+            # index by treating it as a normalized value.
+            if cfg.policy.num_discrete_actions is not None:
+                continuous_action = postprocessor.process_action(action[..., :-1])
+                discrete_action = action[..., -1:].to(
+                    device=continuous_action.device, dtype=continuous_action.dtype
+                )
+                action = torch.cat([continuous_action, discrete_action], dim=-1)
+            else:
+                action = postprocessor.process_action(action)
        policy_fps = policy_timer.fps_last

        log_policy_frequency_issue(policy_fps=policy_fps, cfg=cfg, interaction_step=interaction_step)
@@ -326,7 +338,8 @@ def act_with_policy(

        # Check for intervention from transition info
        intervention_info = new_transition[TransitionKey.INFO]
-        if intervention_info.get(TeleopEvents.IS_INTERVENTION, False):
+        is_intervention = bool(intervention_info.get(TeleopEvents.IS_INTERVENTION, False))
+        if is_intervention:
            episode_intervention = True
            episode_intervention_steps += 1

@@ -334,6 +347,10 @@ def act_with_policy(
            "discrete_penalty": torch.tensor(
                [new_transition[TransitionKey.COMPLEMENTARY_DATA].get("discrete_penalty", 0.0)]
            ),
+            # Forward the intervention flag so the learner can route this transition
+            # into the offline replay buffer (see `process_transitions` in learner.py).
+            # Use the plain string key so the payload survives torch.load(weights_only=True).
+            TeleopEvents.IS_INTERVENTION.value: is_intervention,
        }
        # Create transition for learner (convert to old format)
        list_transition_to_send_to_learner.append(
@@ -390,14 +407,7 @@ def act_with_policy(
            episode_intervention_steps = 0
            episode_total_steps = 0

-            # Reset environment and processors
-            obs, info = online_env.reset()
-            env_processor.reset()
-            action_processor.reset()
-
-            # Process initial observation
-            transition = create_transition(observation=obs, info=info)
-            transition = env_processor(transition)
+            transition = reset_and_build_transition(online_env, env_processor, action_processor)

        if cfg.env.fps is not None:
            dt_time = time.perf_counter() - start_time
@@ -15,6 +15,7 @@
 # limitations under the License.

 import functools
+import threading
 from collections.abc import Callable, Sequence
 from contextlib import suppress
 from typing import TypedDict
@@ -115,6 +116,7 @@ class ReplayBuffer:
        self.size = 0
        self.initialized = False
        self.optimize_memory = optimize_memory
+        self._lock = threading.Lock()

        # Track episode boundaries for memory optimization
        self.episode_ends = torch.zeros(capacity, dtype=torch.bool, device=storage_device)
@@ -198,68 +200,75 @@ class ReplayBuffer:
        complementary_info: dict[str, torch.Tensor] | None = None,
    ):
        """Saves a transition, ensuring tensors are stored on the designated storage device."""
-        # Initialize storage if this is the first transition
-        if not self.initialized:
-            self._initialize_storage(state=state, action=action, complementary_info=complementary_info)
+        with self._lock:
+            # Initialize storage if this is the first transition
+            if not self.initialized:
+                self._initialize_storage(state=state, action=action, complementary_info=complementary_info)

-        # Store the transition in pre-allocated tensors
-        for key in self.states:
-            self.states[key][self.position].copy_(state[key].squeeze(dim=0))
+            # Store the transition in pre-allocated tensors
+            for key in self.states:
+                self.states[key][self.position].copy_(state[key].squeeze(dim=0))

-            if not self.optimize_memory:
-                # Only store next_states if not optimizing memory
-                self.next_states[key][self.position].copy_(next_state[key].squeeze(dim=0))
+                if not self.optimize_memory:
+                    # Only store next_states if not optimizing memory
+                    self.next_states[key][self.position].copy_(next_state[key].squeeze(dim=0))

-        self.actions[self.position].copy_(action.squeeze(dim=0))
-        self.rewards[self.position] = reward
-        self.dones[self.position] = done
-        self.truncateds[self.position] = truncated
+            self.actions[self.position].copy_(action.squeeze(dim=0))
+            self.rewards[self.position] = reward
+            self.dones[self.position] = done
+            self.truncateds[self.position] = truncated

-        # Handle complementary_info if provided and storage is initialized
-        if complementary_info is not None and self.has_complementary_info:
-            # Store the complementary_info
-            for key in self.complementary_info_keys:
-                if key in complementary_info:
-                    value = complementary_info[key]
-                    if isinstance(value, torch.Tensor):
-                        self.complementary_info[key][self.position].copy_(value.squeeze(dim=0))
-                    elif isinstance(value, (int | float)):
-                        self.complementary_info[key][self.position] = value
+            # Handle complementary_info if provided and storage is initialized
+            if complementary_info is not None and self.has_complementary_info:
+                for key in self.complementary_info_keys:
+                    if key in complementary_info:
+                        value = complementary_info[key]
+                        if isinstance(value, torch.Tensor):
+                            self.complementary_info[key][self.position].copy_(value.squeeze(dim=0))
+                        elif isinstance(value, (int | float)):
+                            self.complementary_info[key][self.position] = value

-        self.position = (self.position + 1) % self.capacity
-        self.size = min(self.size + 1, self.capacity)
+            self.position = (self.position + 1) % self.capacity
+            self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size: int) -> BatchTransition:
        """Sample a random batch of transitions and collate them into batched tensors."""
        if not self.initialized:
            raise RuntimeError("Cannot sample from an empty buffer. Add transitions first.")

-        batch_size = min(batch_size, self.size)
-        high = max(0, self.size - 1) if self.optimize_memory and self.size < self.capacity else self.size
+        with self._lock:
+            batch_size = min(batch_size, self.size)
+            high = max(0, self.size - 1) if self.optimize_memory and self.size < self.capacity else self.size

-        # Random indices for sampling - create on the same device as storage
-        idx = torch.randint(low=0, high=high, size=(batch_size,), device=self.storage_device)
+            idx = torch.randint(low=0, high=high, size=(batch_size,), device=self.storage_device)

-        # Identify image keys that need augmentation
-        image_keys = [k for k in self.states if k.startswith(OBS_IMAGE)] if self.use_drq else []
+            image_keys = [k for k in self.states if k.startswith(OBS_IMAGE)] if self.use_drq else []

-        # Create batched state and next_state
-        batch_state = {}
-        batch_next_state = {}
+            batch_state = {}
+            batch_next_state = {}

-        # First pass: load all state tensors to target device
-        for key in self.states:
-            batch_state[key] = self.states[key][idx].to(self.device)
+            for key in self.states:
+                batch_state[key] = self.states[key][idx].to(self.device)

-            if not self.optimize_memory:
-                # Standard approach - load next_states directly
-                batch_next_state[key] = self.next_states[key][idx].to(self.device)
-            else:
-                # Memory-optimized approach - get next_state from the next index
-                next_idx = (idx + 1) % self.capacity
-                batch_next_state[key] = self.states[key][next_idx].to(self.device)
+                if not self.optimize_memory:
+                    batch_next_state[key] = self.next_states[key][idx].to(self.device)
+                else:
+                    next_idx = (idx + 1) % self.capacity
+                    batch_next_state[key] = self.states[key][next_idx].to(self.device)
+
+            # Sample other tensors
+            batch_actions = self.actions[idx].to(self.device)
+            batch_rewards = self.rewards[idx].to(self.device)
+            batch_dones = self.dones[idx].to(self.device).float()
+            batch_truncateds = self.truncateds[idx].to(self.device).float()
+
+            # Sample complementary_info if available
+            batch_complementary_info = None
+            if self.has_complementary_info:
+                batch_complementary_info = {}
+                for key in self.complementary_info_keys:
+                    batch_complementary_info[key] = self.complementary_info[key][idx].to(self.device)

-        # Apply image augmentation in a batched way if needed
        if self.use_drq and image_keys:
            # Concatenate all images from state and next_state
            all_images = []
@@ -280,19 +289,6 @@ class ReplayBuffer:
                # Next states start after the states at index (i*2+1)*batch_size and also take up batch_size slots
                batch_next_state[key] = augmented_images[(i * 2 + 1) * batch_size : (i + 1) * 2 * batch_size]

-        # Sample other tensors
-        batch_actions = self.actions[idx].to(self.device)
-        batch_rewards = self.rewards[idx].to(self.device)
-        batch_dones = self.dones[idx].to(self.device).float()
-        batch_truncateds = self.truncateds[idx].to(self.device).float()
-
-        # Sample complementary_info if available
-        batch_complementary_info = None
-        if self.has_complementary_info:
-            batch_complementary_info = {}
-            for key in self.complementary_info_keys:
-                batch_complementary_info[key] = self.complementary_info[key][idx].to(self.device)
-
        return BatchTransition(
            state=batch_state,
            action=batch_actions,
@@ -383,10 +383,21 @@ def make_processors(
            GymHILAdapterProcessorStep(),
            Numpy2TorchActionProcessorStep(),
            VanillaObservationProcessorStep(),
-            AddBatchDimensionProcessorStep(),
-            DeviceProcessorStep(device=device),
        ]

+        # Add time limit processor if reset config exists
+        if cfg.processor.reset is not None:
+            env_pipeline_steps.append(
+                TimeLimitProcessorStep(max_episode_steps=int(cfg.processor.reset.control_time_s * cfg.fps))
+            )
+
+        env_pipeline_steps.extend(
+            [
+                AddBatchDimensionProcessorStep(),
+                DeviceProcessorStep(device=device),
+            ]
+        )
+
        return DataProcessorPipeline(
            steps=env_pipeline_steps, to_transition=identity_transition, to_output=identity_transition
        ), DataProcessorPipeline(
@@ -551,8 +562,19 @@ def step_env_and_process_transition(
    terminated = terminated or processed_action_transition[TransitionKey.DONE]
    truncated = truncated or processed_action_transition[TransitionKey.TRUNCATED]
    complementary_data = processed_action_transition[TransitionKey.COMPLEMENTARY_DATA].copy()
-    new_info = processed_action_transition[TransitionKey.INFO].copy()
-    new_info.update(info)
+
+    if hasattr(env, "get_raw_joint_positions"):
+        raw_joint_positions = env.get_raw_joint_positions()
+        if raw_joint_positions is not None:
+            complementary_data["raw_joint_positions"] = raw_joint_positions
+
+    # Merge info: start from the env's info, then copy in only the `TeleopEvents` enum
+    # keys from the action processor (overriding the env); its other keys are dropped.
+    action_info = processed_action_transition[TransitionKey.INFO]
+    new_info = info.copy()
+    for key, value in action_info.items():
+        if isinstance(key, TeleopEvents):
+            new_info[key] = value

    new_transition = create_transition(
        observation=obs,
@@ -568,6 +590,24 @@ def step_env_and_process_transition(
    return new_transition


+def reset_and_build_transition(
+    env: gym.Env,
+    env_processor: DataProcessorPipeline[EnvTransition, EnvTransition],
+    action_processor: DataProcessorPipeline[EnvTransition, EnvTransition],
+) -> EnvTransition:
+    """Reset env + processors and return the first env-processed transition."""
+    obs, info = env.reset()
+    env_processor.reset()
+    action_processor.reset()
+    complementary_data: dict[str, Any] = {}
+    if hasattr(env, "get_raw_joint_positions"):
+        raw_joint_positions = env.get_raw_joint_positions()
+        if raw_joint_positions is not None:
+            complementary_data["raw_joint_positions"] = raw_joint_positions
+    transition = create_transition(observation=obs, info=info, complementary_data=complementary_data)
+    return env_processor(data=transition)
+
+
 def control_loop(
    env: gym.Env,
    env_processor: DataProcessorPipeline[EnvTransition, EnvTransition],
@@ -593,17 +633,7 @@ def control_loop(
    print("- When not intervening, robot will stay still")
    print("- Press Ctrl+C to exit")

-    # Reset environment and processors
-    obs, info = env.reset()
-    complementary_data = (
-        {"raw_joint_positions": info.pop("raw_joint_positions")} if "raw_joint_positions" in info else {}
-    )
-    env_processor.reset()
-    action_processor.reset()
-
-    # Process initial observation
-    transition = create_transition(observation=obs, info=info, complementary_data=complementary_data)
-    transition = env_processor(data=transition)
+    transition = reset_and_build_transition(env, env_processor, action_processor)

    # Determine if gripper is used
    use_gripper = cfg.env.processor.gripper.use_gripper if cfg.env.processor.gripper is not None else True
@@ -665,7 +695,7 @@ def control_loop(
        # Create a neutral action (no movement)
        neutral_action = torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)
        if use_gripper:
-            neutral_action = torch.cat([neutral_action, torch.tensor([0.0])])  # Gripper stay
+            neutral_action = torch.cat([neutral_action, torch.tensor([1.0])])  # Gripper stay

        # Use the new step function
        transition = step_env_and_process_transition(
@@ -723,12 +753,7 @@ def control_loop(
                    dataset.save_episode()

            # Reset for new episode
-            obs, info = env.reset()
-            env_processor.reset()
-            action_processor.reset()
-
-            transition = create_transition(observation=obs, info=info)
-            transition = env_processor(transition)
+            transition = reset_and_build_transition(env, env_processor, action_processor)

        # Maintain fps timing
        precise_sleep(max(dt - (time.perf_counter() - step_start_time), 0.0))
@@ -70,7 +70,7 @@ from lerobot.common.wandb_utils import WandBLogger
 from lerobot.configs import parser
 from lerobot.configs.train import TrainRLServerPipelineConfig
 from lerobot.datasets import LeRobotDataset, make_dataset
-from lerobot.policies import make_policy
+from lerobot.policies import make_policy, make_pre_post_processors
 from lerobot.policies.sac.modeling_sac import SACPolicy
 from lerobot.robots import so_follower  # noqa: F401
 from lerobot.teleoperators import gamepad, so_leader  # noqa: F401
@@ -317,6 +317,11 @@ def add_actor_information_and_train(

    policy.train()

+    preprocessor, _postprocessor = make_pre_post_processors(
+        policy_cfg=cfg.policy,
+        dataset_stats=cfg.policy.dataset_stats,
+    )
+
    push_actor_policy_to_queue(parameters_queue=parameters_queue, policy=policy)

    last_time_policy_pushed = time.time()
@@ -405,8 +410,8 @@ def add_actor_information_and_train(

            actions = batch[ACTION]
            rewards = batch["reward"]
-            observations = batch["state"]
-            next_observations = batch["next_state"]
+            observations = preprocessor.process_observation(batch["state"])
+            next_observations = preprocessor.process_observation(batch["next_state"])
            done = batch["done"]
            check_nan_in_transition(observations=observations, actions=actions, next_state=next_observations)

@@ -463,8 +468,8 @@ def add_actor_information_and_train(

        actions = batch[ACTION]
        rewards = batch["reward"]
-        observations = batch["state"]
-        next_observations = batch["next_state"]
+        observations = preprocessor.process_observation(batch["state"])
+        next_observations = preprocessor.process_observation(batch["next_state"])
        done = batch["done"]

        check_nan_in_transition(observations=observations, actions=actions, next_state=next_observations)
@@ -1163,7 +1168,7 @@ def process_transitions(

            # Add to offline buffer if it's an intervention
            if dataset_repo_id is not None and transition.get("complementary_info", {}).get(
-                TeleopEvents.IS_INTERVENTION
+                TeleopEvents.IS_INTERVENTION.value
            ):
                offline_replay_buffer.add(**transition)

@@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, Any

 from lerobot.cameras import make_cameras_from_configs
 from lerobot.types import RobotAction, RobotObservation
-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 from ..robot import Robot
 from ..utils import ensure_safe_goal_position
@@ -81,6 +81,7 @@ class Reachy2Robot(Robot):
    name = "reachy2"

    def __init__(self, config: Reachy2RobotConfig):
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -353,7 +353,8 @@ class GripperVelocityToJoint(RobotActionProcessorStep):
        speed_factor: A scaling factor to convert the normalized velocity command to a position change.
        clip_min: The minimum allowed gripper joint position.
        clip_max: The maximum allowed gripper joint position.
-        discrete_gripper: If True, treat the input action as discrete (0: open, 1: close, 2: stay).
+        discrete_gripper: If True, interpret the input as a discrete class index
+            {0 = close, 1 = stay, 2 = open}, matching `GamepadTeleop.GripperAction`.
    """

    speed_factor: float = 20.0
@@ -377,10 +378,10 @@ class GripperVelocityToJoint(RobotActionProcessorStep):
            raise ValueError("Joints observation is require for computing robot kinematics")

        if self.discrete_gripper:
-            # Discrete gripper actions are in [0, 1, 2]
-            # 0: open, 1: close, 2: stay
-            # We need to shift them to [-1, 0, 1] and then scale them to clip_max
-            gripper_vel = (gripper_vel - 1) * self.clip_max
+            # Map discrete command {0=close, 1=stay, 2=open} -> signed velocity.
+            # Negation accounts for SO100 sign (joint position increases on close).
+            #   0 -> +clip_max (close), 1 -> 0 (stay), 2 -> -clip_max (open)
+            gripper_vel = -(gripper_vel - 1) * self.clip_max

        # Compute desired gripper position
        delta = gripper_vel * float(self.speed_factor)
@@ -27,7 +27,7 @@ import numpy as np

 from lerobot.cameras import make_cameras_from_configs
 from lerobot.types import RobotAction, RobotObservation
-from lerobot.utils.import_utils import _unitree_sdk_available
+from lerobot.utils.import_utils import _unitree_sdk_available, require_package

 from ..robot import Robot
 from .config_unitree_g1 import UnitreeG1Config
@@ -111,6 +111,7 @@ class UnitreeG1(Robot):
    name = "unitree_g1"

    def __init__(self, config: UnitreeG1Config):
+        require_package("unitree-sdk2py", extra="unitree_g1", import_name="unitree_sdk2py")
        super().__init__(config)

        logger.info("Initialize UnitreeG1...")
@@ -286,7 +286,7 @@ def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int):
    if len(set(num_eps_per_cam)) != 1:
        raise ValueError(f"All cams dont have same number of episodes ({num_eps_per_cam}).")

-    episods_metadata = []
+    episodes_metadata = []
    num_cameras = len(video_keys)
    num_episodes = num_eps_per_cam[0]
    for ep_idx in tqdm.tqdm(range(num_episodes), desc="convert videos"):
@@ -299,9 +299,9 @@ def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int):
        ep_dict = {}
        for cam_idx in range(num_cameras):
            ep_dict.update(eps_metadata_per_cam[cam_idx][ep_idx])
-        episods_metadata.append(ep_dict)
+        episodes_metadata.append(ep_dict)

-    return episods_metadata
+    return episodes_metadata


 def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_file_size_in_mb: int):
@@ -150,11 +150,24 @@ Show dataset information without feature details:
        --operation.type info \
        --operation.show_features false

-Recompute dataset statistics:
+Recompute dataset statistics (saves to lerobot/pusht_recomputed_stats by default):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type recompute_stats

+Recompute stats and save to a specific new repo_id:
+    lerobot-edit-dataset \
+        --repo_id lerobot/pusht \
+        --new_repo_id lerobot/pusht_new_stats \
+        --operation.type recompute_stats
+
+Recompute stats in-place (overwrites original dataset stats):
+    lerobot-edit-dataset \
+        --repo_id lerobot/pusht \
+        --new_repo_id lerobot/pusht \
+        --operation.type recompute_stats \
+        --operation.overwrite true
+
 Recompute stats for relative actions and push to hub:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
@@ -256,6 +269,7 @@ class RecomputeStatsConfig(OperationConfig):
    relative_exclude_joints: list[str] | None = None
    chunk_size: int = 50
    num_workers: int = 0
+    overwrite: bool = False


@OperationConfig.register_subclass("info")
@@ -280,16 +294,30 @@ class EditDatasetConfig:
    push_to_hub: bool = False


+def _resolve_io_paths(
+    repo_id: str,
+    new_repo_id: str | None,
+    root: Path | str | None,
+    new_root: Path | str | None,
+    default_new_repo_id: str | None = None,
+) -> tuple[str, Path, Path]:
+    """Resolve input/output paths and repo_id for dataset operations.
+
+    Args:
+        repo_id: Repo id of the source dataset.
+        new_repo_id: Explicit repo id for the output; used whenever it is truthy.
+        root: Source dataset directory. When falsy, ``HF_LEROBOT_HOME / repo_id`` is used.
+        new_root: Output directory. When falsy, ``HF_LEROBOT_HOME / output_repo_id`` is used.
+        default_new_repo_id: Output repo id used when ``new_repo_id`` is falsy. When this is
+            falsy as well, the output repo id is ``repo_id`` itself.
+
+    Returns:
+        ``(output_repo_id, input_path, output_path)``. Both paths are passed through
+        ``Path.resolve()`` (symlink-safe), so callers can detect an in-place operation
+        by comparing them for equality.
+    """
+    # An explicit root wins over the default location derived from repo_id.
+    input_path = (Path(root) if root else HF_LEROBOT_HOME / repo_id).resolve()
+    # Precedence: caller-supplied new_repo_id, then the operation default, then the source repo_id.
+    output_repo_id = new_repo_id or default_new_repo_id or repo_id
+    output_path = (Path(new_root) if new_root else HF_LEROBOT_HOME / output_repo_id).resolve()
+    return output_repo_id, input_path, output_path
+
+
 def get_output_path(
    repo_id: str,
    new_repo_id: str | None,
    root: Path | str | None,
    new_root: Path | str | None,
 ) -> tuple[str, Path]:
-    input_path = Path(root) if root else HF_LEROBOT_HOME / repo_id
-
-    output_repo_id = new_repo_id if new_repo_id else repo_id
-    output_path = Path(new_root) if new_root else HF_LEROBOT_HOME / output_repo_id
+    output_repo_id, input_path, output_path = _resolve_io_paths(repo_id, new_repo_id, root, new_root)

    # In case of in-place modification, create a backup of the original dataset (if it exists)
    if output_path == input_path:
@@ -557,7 +585,39 @@ def handle_recompute_stats(cfg: EditDatasetConfig) -> None:
    if not isinstance(cfg.operation, RecomputeStatsConfig):
        raise ValueError("Operation config must be RecomputeStatsConfig")

-    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
+    # Resolve where the dataset is read from and where the recomputed stats are written.
+    # Without --new_repo_id / --new_root the output defaults to "<repo_id>_recomputed_stats",
+    # so the source dataset is only modified when the caller explicitly targets it.
+    output_repo_id, input_root, output_root = _resolve_io_paths(
+        cfg.repo_id,
+        cfg.new_repo_id,
+        cfg.root,
+        cfg.new_root,
+        default_new_repo_id=f"{cfg.repo_id}_recomputed_stats",
+    )
+    # Both paths are resolved, so equality means the operation would run in-place.
+    in_place = output_root == input_root
+
+    # In-place modification is destructive and therefore requires an explicit opt-in.
+    if in_place and not cfg.operation.overwrite:
+        raise ValueError(
+            f"recompute_stats would overwrite the dataset in-place at {input_root}. "
+            "Pass --operation.overwrite true to allow in-place modification, "
+            "or use --new_repo_id / --new_root to write to a different location. "
+            f"Default output repo_id when neither is set: '{cfg.repo_id}_recomputed_stats'."
+        )
+
+    if in_place:
+        logging.warning(
+            f"Overwriting dataset stats in-place at {input_root}. The original stats will be lost."
+        )
+        dataset = LeRobotDataset(cfg.repo_id, root=input_root)
+    else:
+        # Validate the source before touching the destination. Otherwise a missing source
+        # would only surface inside copytree, after an existing output directory has already
+        # been moved aside and any previous "_old" backup has been deleted.
+        if not input_root.exists():
+            raise FileNotFoundError(
+                f"Source dataset directory {input_root} does not exist; cannot copy it to {output_root}."
+            )
+        logging.info(f"Copying dataset from {input_root} to {output_root}")
+        if output_root.exists():
+            # Keep a single backup generation: replace any previous "<name>_old" directory.
+            backup_path = output_root.with_name(output_root.name + "_old")
+            logging.warning(f"Output directory {output_root} already exists. Moving to {backup_path}")
+            if backup_path.exists():
+                shutil.rmtree(backup_path)
+            shutil.move(output_root, backup_path)
+        shutil.copytree(input_root, output_root)
+        # Work on the copy so the source dataset stays untouched.
+        dataset = LeRobotDataset(output_repo_id, root=output_root)

    logging.info(f"Recomputing stats for {cfg.repo_id}")
    if cfg.operation.relative_action:
@@ -578,7 +638,7 @@ def handle_recompute_stats(cfg: EditDatasetConfig) -> None:
    logging.info(f"Stats written to {dataset.root}")

    if cfg.push_to_hub:
-        logging.info(f"Pushing to hub as {dataset.meta.repo_id}...")
+        logging.info(f"Pushing to hub as {dataset.repo_id}...")
        dataset.push_to_hub()


@@ -572,7 +572,7 @@ def eval_main(cfg: EvalPipelineConfig):
            preprocessor=preprocessor,
            postprocessor=postprocessor,
            n_episodes=cfg.eval.n_episodes,
-            max_episodes_rendered=cfg.eval.max_episodes_rendered,
+            max_episodes_rendered=10,
            videos_dir=Path(cfg.output_dir) / "videos",
            start_seed=cfg.seed,
            max_parallel_tasks=cfg.env.max_parallel_tasks,
@@ -559,7 +559,11 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
            )

        # Load pretrained policy
-        policy = None if cfg.policy is None else make_policy(cfg.policy, ds_meta=dataset.meta)
+        policy = (
+            None
+            if cfg.policy is None
+            else make_policy(cfg.policy, ds_meta=dataset.meta, rename_map=cfg.dataset.rename_map)
+        )
        preprocessor = None
        postprocessor = None
        interpolator = None
@@ -71,9 +71,6 @@ def update_policy(
    lr_scheduler=None,
    lock=None,
    rabc_weights_provider=None,
-    *,
-    do_optimizer_step: bool = True,
-    loss_divisor: int = 1,
 ) -> tuple[MetricsTracker, dict]:
    """
    Performs a single training step to update the policy's weights.
@@ -125,38 +122,34 @@ def update_policy(
            loss, output_dict = policy.forward(batch)

        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
-        logged_loss = loss.detach()
-        if loss_divisor > 1:
-            loss = loss / loss_divisor

    # Use accelerator's backward method
    accelerator.backward(loss)

-    grad_norm_value = 0.0
-    if do_optimizer_step:
-        if grad_clip_norm > 0:
-            grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
-        else:
-            grad_norm = torch.nn.utils.clip_grad_norm_(
-                policy.parameters(), float("inf"), error_if_nonfinite=False
-            )
-        grad_norm_value = grad_norm.item()
+    # Clip gradients if specified
+    if grad_clip_norm > 0:
+        grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
+    else:
+        grad_norm = torch.nn.utils.clip_grad_norm_(
+            policy.parameters(), float("inf"), error_if_nonfinite=False
+        )

-        with lock if lock is not None else nullcontext():
-            optimizer.step()
+    # Optimizer step
+    with lock if lock is not None else nullcontext():
+        optimizer.step()

-        optimizer.zero_grad()
+    optimizer.zero_grad()

-        # Step through pytorch scheduler at every optimizer step instead of epoch
-        if lr_scheduler is not None:
-            lr_scheduler.step()
+    # Step through pytorch scheduler at every batch instead of epoch
+    if lr_scheduler is not None:
+        lr_scheduler.step()

-        # Update internal buffers if policy has update method
-        if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
-            accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
+    # Update internal buffers if policy has update method
+    if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
+        accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()

-    train_metrics.loss = logged_loss.item()
-    train_metrics.grad_norm = grad_norm_value
+    train_metrics.loss = loss.item()
+    train_metrics.grad_norm = grad_norm.item()
    train_metrics.lr = optimizer.param_groups[0]["lr"]
    train_metrics.update_s = time.perf_counter() - start_time
    return train_metrics, output_dict
@@ -366,16 +359,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
        logging.info(f"{dataset.num_episodes=}")
        num_processes = accelerator.num_processes
-        micro_batch = cfg.batch_size
-        logical_batch = cfg.batch_size * cfg.gradient_accumulation_steps
-        effective_bs = logical_batch * num_processes
-        logging.info(
-            "Effective batch size: %s x %s x %s = %s",
-            micro_batch,
-            cfg.gradient_accumulation_steps,
-            num_processes,
-            effective_bs,
-        )
+        effective_bs = cfg.batch_size * num_processes
+        logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
        logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
        logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")

@@ -401,7 +386,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        sampler=sampler,
        pin_memory=device.type == "cuda",
        drop_last=False,
-        prefetch_factor=2 if cfg.num_workers > 0 else None,
+        prefetch_factor=cfg.prefetch_factor if cfg.num_workers > 0 else None,
+        persistent_workers=cfg.persistent_workers and cfg.num_workers > 0,
    )

    # Prepare everything with accelerator
@@ -422,10 +408,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
    }

    # Keep global batch size for logging; MetricsTracker handles world size internally.
-    logical_batch_size = cfg.batch_size * cfg.gradient_accumulation_steps
-    effective_batch_size = logical_batch_size * accelerator.num_processes
+    effective_batch_size = cfg.batch_size * accelerator.num_processes
    train_tracker = MetricsTracker(
-        logical_batch_size,
+        cfg.batch_size,
        dataset.num_frames,
        dataset.num_episodes,
        train_metrics,
@@ -447,62 +432,24 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        )

    for _ in range(step, cfg.steps):
-        step_dataloading_s = 0.0
-        step_update_s = 0.0
-        step_losses = []
-        step_grad_norm = 0.0
-        step_lr = optimizer.param_groups[0]["lr"]
-        output_dict = {}
-        optimizer.zero_grad()
-        for accumulation_idx in range(cfg.gradient_accumulation_steps):
-            start_time = time.perf_counter()
-            batch = next(dl_iter)
-            batch = preprocessor(batch)
-            step_dataloading_s += time.perf_counter() - start_time
+        start_time = time.perf_counter()
+        batch = next(dl_iter)
+        for cam_key in dataset.meta.camera_keys:
+            if cam_key in batch and batch[cam_key].dtype == torch.uint8:
+                batch[cam_key] = batch[cam_key].to(dtype=torch.float32) / 255.0
+        batch = preprocessor(batch)
+        train_tracker.dataloading_s = time.perf_counter() - start_time

-            is_last_microbatch = accumulation_idx == cfg.gradient_accumulation_steps - 1
-            micro_metrics = MetricsTracker(
-                cfg.batch_size,
-                dataset.num_frames,
-                dataset.num_episodes,
-                {
-                    "loss": AverageMeter("loss", ":.3f"),
-                    "grad_norm": AverageMeter("grdn", ":.3f"),
-                    "lr": AverageMeter("lr", ":0.1e"),
-                    "update_s": AverageMeter("updt_s", ":.3f"),
-                },
-                accelerator=accelerator,
-            )
-            sync_context = (
-                nullcontext()
-                if is_last_microbatch or accelerator.num_processes == 1
-                else accelerator.no_sync(policy)
-            )
-            with sync_context:
-                micro_metrics, micro_output_dict = update_policy(
-                    micro_metrics,
-                    policy,
-                    batch,
-                    optimizer,
-                    cfg.optimizer.grad_clip_norm,
-                    accelerator=accelerator,
-                    lr_scheduler=lr_scheduler if is_last_microbatch else None,
-                    rabc_weights_provider=rabc_weights,
-                    do_optimizer_step=is_last_microbatch,
-                    loss_divisor=cfg.gradient_accumulation_steps,
-                )
-            step_update_s += micro_metrics.update_s.val
-            step_losses.append(micro_metrics.loss.val)
-            if is_last_microbatch:
-                step_grad_norm = micro_metrics.grad_norm.val
-                step_lr = micro_metrics.lr.val
-                output_dict = micro_output_dict
-
-        train_tracker.loss = sum(step_losses) / len(step_losses)
-        train_tracker.grad_norm = step_grad_norm
-        train_tracker.lr = step_lr
-        train_tracker.update_s = step_update_s
-        train_tracker.dataloading_s = step_dataloading_s
+        train_tracker, output_dict = update_policy(
+            train_tracker,
+            policy,
+            batch,
+            optimizer,
+            cfg.optimizer.grad_clip_norm,
+            accelerator=accelerator,
+            lr_scheduler=lr_scheduler,
+            rabc_weights_provider=rabc_weights,
+        )

        # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
        # increment `step` here.
@@ -567,7 +514,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
                        postprocessor=postprocessor,
                        n_episodes=cfg.eval.n_episodes,
                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=cfg.eval.max_episodes_rendered,
+                        max_episodes_rendered=4,
                        start_seed=cfg.seed,
                        max_parallel_tasks=cfg.env.max_parallel_tasks,
                    )
@@ -598,9 +545,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
                if wandb_logger:
                    wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
                    wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
-                    video_paths = eval_info["overall"].get("video_paths", [])
-                    if video_paths:
-                        wandb_logger.log_video(video_paths[0], step, mode="eval")
+                    # Only upload a video when one exists: indexing `video_paths` directly
+                    # raises KeyError/IndexError if the key is absent or the list is empty,
+                    # which would abort training from inside the eval logging step.
+                    video_paths = eval_info["overall"].get("video_paths", [])
+                    if video_paths:
+                        wandb_logger.log_video(video_paths[0], step, mode="eval")

            accelerator.wait_for_everyone()

@@ -15,9 +15,22 @@
 # limitations under the License.

 import logging
+from typing import TYPE_CHECKING
+
+from lerobot.utils.import_utils import _hidapi_available, _pygame_available, require_package

 from ..utils import TeleopEvents

+if TYPE_CHECKING or _pygame_available:
+    import pygame
+else:
+    pygame = None  # type: ignore[assignment]
+
+if TYPE_CHECKING or _hidapi_available:
+    import hid
+else:
+    hid = None  # type: ignore[assignment]
+

 class InputController:
    """Base class for input controllers that generate motion deltas."""
@@ -199,6 +212,7 @@ class GamepadController(InputController):
    """Generate motion deltas from gamepad input."""

    def __init__(self, x_step_size=1.0, y_step_size=1.0, z_step_size=1.0, deadzone=0.1):
+        require_package("pygame", extra="gamepad")
        super().__init__(x_step_size, y_step_size, z_step_size)
        self.deadzone = deadzone
        self.joystick = None
@@ -206,8 +220,6 @@ class GamepadController(InputController):

    def start(self):
        """Initialize pygame and the gamepad."""
-        import pygame
-
        pygame.init()
        pygame.joystick.init()

@@ -230,8 +242,6 @@ class GamepadController(InputController):

    def stop(self):
        """Clean up pygame resources."""
-        import pygame
-
        if pygame.joystick.get_init():
            if self.joystick:
                self.joystick.quit()
@@ -240,8 +250,6 @@ class GamepadController(InputController):

    def update(self):
        """Process pygame events to get fresh gamepad readings."""
-        import pygame
-
        for event in pygame.event.get():
            if event.type == pygame.JOYBUTTONDOWN:
                if event.button == 3:
@@ -280,8 +288,6 @@ class GamepadController(InputController):

    def get_deltas(self):
        """Get the current movement deltas from gamepad state."""
-        import pygame
-
        try:
            # Read joystick axes
            # Left stick X and Y (typically axes 0 and 1)
@@ -326,6 +332,7 @@ class GamepadControllerHID(InputController):
            z_scale: Scaling factor for Z-axis movement
            deadzone: Joystick deadzone to prevent drift
        """
+        require_package("hidapi", extra="gamepad", import_name="hid")
        super().__init__(x_step_size, y_step_size, z_step_size)
        self.deadzone = deadzone
        self.device = None
@@ -342,8 +349,6 @@ class GamepadControllerHID(InputController):

    def find_device(self):
        """Look for the gamepad device by vendor and product ID."""
-        import hid
-
        devices = hid.enumerate()
        for device in devices:
            device_name = device["product_string"]
@@ -357,8 +362,6 @@ class GamepadControllerHID(InputController):

    def start(self):
        """Connect to the gamepad using HIDAPI."""
-        import hid
-
        self.device_info = self.find_device()
        if not self.device_info:
            self.running = False
@@ -45,7 +45,7 @@ class HomunculusArm(Teleoperator):
    name = "homunculus_arm"

    def __init__(self, config: HomunculusArmConfig):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
        super().__init__(config)
        self.config = config
        self.serial = serial.Serial(config.port, config.baud_rate, timeout=1)
@@ -71,7 +71,7 @@ class HomunculusGlove(Teleoperator):
    name = "homunculus_glove"

    def __init__(self, config: HomunculusGloveConfig):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="pyserial-dep", import_name="serial")
        super().__init__(config)
        self.config = config
        self.serial = serial.Serial(config.port, config.baud_rate, timeout=1)
@@ -23,7 +23,7 @@ from typing import Any

 from lerobot.types import RobotAction
 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
-from lerobot.utils.import_utils import _pynput_available
+from lerobot.utils.import_utils import _pynput_available, require_package

 from ..teleoperator import Teleoperator
 from ..utils import TeleopEvents
@@ -56,6 +56,7 @@ class KeyboardTeleop(Teleoperator):
    name = "keyboard"

    def __init__(self, config: KeyboardTeleopConfig):
+        require_package("pynput", extra="pynput-dep")
        super().__init__(config)
        self.config = config
        self.robot_type = config.type
@@ -21,14 +21,24 @@
 import logging
 import threading
 import time
+from typing import TYPE_CHECKING

-import hebi
 import numpy as np
-from teleop import Teleop

 from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+from lerobot.utils.import_utils import _hebi_available, _teleop_available, require_package
 from lerobot.utils.rotation import Rotation

+if TYPE_CHECKING or _hebi_available:
+    import hebi
+else:
+    hebi = None
+
+if TYPE_CHECKING or _teleop_available:
+    from teleop import Teleop
+else:
+    Teleop = None
+
 from ..teleoperator import Teleoperator
 from .config_phone import PhoneConfig, PhoneOS

@@ -74,6 +84,8 @@ class IOSPhone(BasePhone, Teleoperator):
    name = "ios_phone"

    def __init__(self, config: PhoneConfig):
+        require_package("hebi-py", extra="phone", import_name="hebi")
+        require_package("teleop", extra="phone")
        super().__init__(config)
        self.config = config
        self._group = None
@@ -213,6 +225,8 @@ class AndroidPhone(BasePhone, Teleoperator):
    name = "android_phone"

    def __init__(self, config: PhoneConfig):
+        require_package("hebi-py", extra="phone", import_name="hebi")
+        require_package("teleop", extra="phone")
        super().__init__(config)
        self.config = config
        self._teleop = None
@@ -19,7 +19,7 @@ import logging
 import time
 from typing import TYPE_CHECKING

-from lerobot.utils.import_utils import _reachy2_sdk_available
+from lerobot.utils.import_utils import _reachy2_sdk_available, require_package

 if TYPE_CHECKING or _reachy2_sdk_available:
    from reachy2_sdk import ReachySDK
@@ -84,6 +84,7 @@ class Reachy2Teleoperator(Teleoperator):
    name = "reachy2_specific"

    def __init__(self, config: Reachy2TeleoperatorConfig):
+        require_package("reachy2_sdk", extra="reachy2")
        super().__init__(config)

        self.config = config
@@ -34,7 +34,7 @@ from typing import TYPE_CHECKING

 import numpy as np

-from lerobot.utils.import_utils import _serial_available
+from lerobot.utils.import_utils import _serial_available, require_package

 if TYPE_CHECKING or _serial_available:
    import serial
@@ -156,6 +156,7 @@ def run_exo_calibration(
    """
    Run interactive calibration for an exoskeleton arm.
    """
+    require_package("pyserial", extra="unitree_g1", import_name="serial")
    try:
        import cv2
        import matplotlib.pyplot as plt
@@ -76,7 +76,7 @@ class ExoskeletonArm:
    calibration: ExoskeletonCalibration | None = None

    def __post_init__(self):
-        require_package("pyserial", extra="hardware", import_name="serial")
+        require_package("pyserial", extra="unitree_g1", import_name="serial")
        if self.calibration_fpath.is_file():
            self._load_calibration()

@@ -1,70 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import json
-from dataclasses import dataclass
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any
-
-from huggingface_hub import HfApi
-
-
-def utc_timestamp_slug(now: datetime | None = None) -> str:
-    current = now or datetime.now(UTC)
-    return current.strftime("%Y%m%dT%H%M%SZ")
-
-
-def make_hub_file_url(repo_id: str, path_in_repo: str, repo_type: str = "dataset") -> str:
-    prefix = "datasets/" if repo_type == "dataset" else ""
-    return f"https://huggingface.co/{prefix}{repo_id}/resolve/main/{path_in_repo}"
-
-
-def write_json(path: Path, payload: dict[str, Any]) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps(payload, indent=2, sort_keys=True))
-
-
-@dataclass(frozen=True)
-class UploadTarget:
-    local_path: Path
-    path_in_repo: str
-
-
-def upload_targets(
-    repo_id: str,
-    targets: list[UploadTarget],
-    *,
-    repo_type: str = "dataset",
-    token: str | None = None,
-    private: bool | None = None,
-    commit_message: str | None = None,
-) -> dict[str, str]:
-    api = HfApi(token=token)
-    api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
-    uploaded: dict[str, str] = {}
-    for target in targets:
-        api.upload_file(
-            path_or_fileobj=str(target.local_path),
-            path_in_repo=target.path_in_repo,
-            repo_id=repo_id,
-            repo_type=repo_type,
-            commit_message=commit_message or f"Upload {target.path_in_repo}",
-        )
-        uploaded[target.path_in_repo] = make_hub_file_url(repo_id, target.path_in_repo, repo_type=repo_type)
-    return uploaded
@@ -115,6 +115,14 @@ _feetech_sdk_available = is_package_available("feetech-servo-sdk", import_name="
 _reachy2_sdk_available = is_package_available("reachy2_sdk")
 _can_available = is_package_available("python-can", "can")
 _unitree_sdk_available = is_package_available("unitree-sdk2py", "unitree_sdk2py")
+_pyrealsense2_available = is_package_available("pyrealsense2") or is_package_available(
+    "pyrealsense2-macosx", import_name="pyrealsense2"
+)
+_zmq_available = is_package_available("pyzmq", import_name="zmq")
+_hebi_available = is_package_available("hebi-py", import_name="hebi")
+_teleop_available = is_package_available("teleop")
+_placo_available = is_package_available("placo")
+_hidapi_available = is_package_available("hidapi", import_name="hid")

 # Data / serialization
 _pandas_available = is_package_available("pandas")
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2b8f8532c7a0b776de5e536b8b54e30b1a0c2e3d5cc25a2d86fe43e40ae5e8c
+oid sha256:8a31653c11eccdd4d80fd3f6a351cd54c49b8a48db1f7e9faf38fddd7900a09f
 size 515400
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:224b5fa4828aa88171b68c036e8919c1eae563e2113f03b6461eadf5bf8525a6
+oid sha256:75bf051698b37dcd7517ec8025a896ab5a0551a6dde5f89d0a3d5d50966e83e6
 size 31672
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`# Copyright 2026 The HuggingFace Inc. team. All rights reserved.`