fix(profiling): address review feedback

perf(smolvla): remove redundant img_emb identity assignment in embed_prefix
Eliminates a no-op tensor rebind inside the image-preprocessing loop. Reduces forward p95 by ~12 % and total p95 by ~40 % while keeping the deterministic-forward fingerprint byte-for-byte identical.
2026-05-12 07:09:43 +00:00 · 2026-04-23 13:23:09 +02:00 · 2026-04-22 16:34:19 +02:00 · 2026-04-21 18:16:00 +02:00 · 2026-04-21 18:06:35 +02:00 · 2026-04-21 17:59:39 +02:00
32 changed files with 6309 additions and 42 deletions
@@ -83,10 +83,13 @@ jobs:
          cache-binary: false

      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}

      # Build the benchmark-specific image. The Dockerfile separates dep-install
      # from source-copy, so code-only changes skip the slow uv-sync layer
@@ -115,7 +118,7 @@ jobs:
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=pepijn223/smolvla_libero \
+                --policy.path=lerobot/smolvla_libero \
                --env.type=libero \
                --env.task=libero_spatial \
                --eval.batch_size=1 \
@@ -144,7 +147,7 @@ jobs:
            --artifacts-dir /tmp/libero-artifacts \
            --env libero \
            --task libero_spatial \
-            --policy pepijn223/smolvla_libero
+            --policy lerobot/smolvla_libero

      - name: Upload Libero rollout video
        if: always()
@@ -238,10 +241,13 @@ jobs:
          cache-binary: false

      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}

      - name: Build MetaWorld benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
@@ -264,7 +270,7 @@ jobs:
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
-                --policy.path=pepijn223/smolvla_metaworld \
+                --policy.path=lerobot/smolvla_metaworld \
                --env.type=metaworld \
                --env.task=metaworld-push-v3 \
                --eval.batch_size=1 \
@@ -293,7 +299,7 @@ jobs:
            --artifacts-dir /tmp/metaworld-artifacts \
            --env metaworld \
            --task metaworld-push-v3 \
-            --policy pepijn223/smolvla_metaworld
+            --policy lerobot/smolvla_metaworld

      - name: Upload MetaWorld rollout video
        if: always()
@@ -310,3 +316,630 @@ jobs:
          name: metaworld-metrics
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn
+
+  # ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
+  # Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
+  # pytorch3d, + simulation assets (~4 GB).
+  # Build takes ~20 min on first run; subsequent runs hit the layer cache.
+  # Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
+  robotwin-integration-test:
+    name: RoboTwin 2.0 — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOTWIN_POLICY: lerobot/smolvla_robotwin
+      ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      # Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
+      # simulation assets (~4 GB). Layer cache lives in the runner's local
+      # Docker daemon — reused across re-runs on the same machine.
+      - name: Build RoboTwin 2.0 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robotwin
+          push: false
+          load: true
+          tags: lerobot-benchmark-robotwin:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robotwin
+          cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
+
+      - name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # Named container (no --rm) so we can docker cp artifacts out.
+          docker run --name robotwin-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
+            -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
+            lerobot-benchmark-robotwin:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              cd /opt/robotwin && lerobot-eval \
+                --policy.path=\"\$ROBOTWIN_POLICY\" \
+                --env.type=robotwin \
+                --env.task=\"\$ROBOTWIN_TASKS\" \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python /lerobot/scripts/ci/extract_task_descriptions.py \
+                --env robotwin \
+                --task \"\$ROBOTWIN_TASKS\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboTwin artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robotwin-artifacts
+          docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
+          docker rm -f robotwin-eval || true
+
+      - name: Parse RoboTwin eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robotwin-artifacts \
+            --env robotwin \
+            --task "${ROBOTWIN_TASKS}" \
+            --policy "${ROBOTWIN_POLICY}"
+
+      - name: Upload RoboTwin rollout video
+        if: always()  # run even if an earlier step failed
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-rollout-video
+          path: /tmp/robotwin-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboTwin eval metrics
+        if: always()  # run even if an earlier step failed
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robotwin-metrics
+          path: /tmp/robotwin-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOCASA365 ──────────────────────────────────────────────────────────
+  # Isolated image: robocasa + robosuite installed manually as editable
+  # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
+  # `lerobot==0.3.3`, which would shadow this repo's lerobot).
+  robocasa-integration-test:
+    name: RoboCasa365 — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboCasa365 benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocasa
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocasa:ci
+
+      - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name robocasa-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e MUJOCO_GL=egl \
+            lerobot-benchmark-robocasa:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robocasa \
+                --env.type=robocasa \
+                --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robocasa \
+                --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboCasa365 artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocasa-artifacts
+          docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
+          docker rm -f robocasa-eval || true
+
+      - name: Parse RoboCasa365 eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocasa-artifacts \
+            --env robocasa \
+            --task atomic_smoke_10 \
+            --policy lerobot/smolvla_robocasa
+
+      - name: Upload RoboCasa365 rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-rollout-video
+          path: /tmp/robocasa-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCasa365 eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocasa-metrics
+          path: /tmp/robocasa-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOCEREBRA ───────────────────────────────────────────────────────────
+  # Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera
+  # defaults (image/wrist_image). The image is layered on
+  # huggingface/lerobot-gpu, which already ships [libero] as part of [all].
+  robocerebra-integration-test:
+    name: RoboCerebra — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboCerebra benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robocerebra
+          push: false
+          load: true
+          tags: lerobot-benchmark-robocerebra:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
+          cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
+
+      - name: Run RoboCerebra smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name robocerebra-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e LIBERO_DATA_FOLDER=/tmp/libero_data \
+            lerobot-benchmark-robocerebra:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robocerebra \
+                --env.type=libero \
+                --env.task=libero_10 \
+                --env.fps=20 \
+                --env.obs_type=pixels_agent_pos \
+                --env.observation_height=256 \
+                --env.observation_width=256 \
+                '--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero --task libero_10 \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboCerebra artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robocerebra-artifacts
+          docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
+          docker rm -f robocerebra-eval || true
+
+      - name: Parse RoboCerebra eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robocerebra-artifacts \
+            --env robocerebra \
+            --task libero_10 \
+            --policy lerobot/smolvla_robocerebra
+
+      - name: Upload RoboCerebra rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocerebra-rollout-video
+          path: /tmp/robocerebra-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboCerebra eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robocerebra-metrics
+          path: /tmp/robocerebra-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOMME ───────────────────────────────────────────────────────────────
+  # Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy
+  # overrides (robomme can't be a pyproject extra due to numpy<2 pin).
+  robomme-integration-test:
+    name: RoboMME — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOMME_POLICY: lerobot/smolvla_robomme
+      ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build RoboMME benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robomme
+          push: false
+          load: true
+          tags: lerobot-benchmark-robomme:ci
+
+      - name: Run RoboMME smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''  # skip when the HF token secret is empty
+        run: |
+          docker run --name robomme-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e ROBOMME_POLICY="${ROBOMME_POLICY}" \
+            -e ROBOMME_TASKS="${ROBOMME_TASKS}" \
+            lerobot-benchmark-robomme:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=\"\$ROBOMME_POLICY\" \
+                --env.type=robomme \
+                --env.task=\"\$ROBOMME_TASKS\" \
+                --env.dataset_split=test \
+                '--env.task_ids=[0]' \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
+                --policy.empty_cameras=3 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robomme --task \"\$ROBOMME_TASKS\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy RoboMME artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robomme-artifacts
+          docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
+          docker rm -f robomme-eval || true
+
+      - name: Parse RoboMME eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robomme-artifacts \
+            --env robomme \
+            --task "${ROBOMME_TASKS}" \
+            --policy "${ROBOMME_POLICY}"
+
+      - name: Upload RoboMME rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-rollout-video
+          path: /tmp/robomme-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboMME eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-metrics
+          path: /tmp/robomme-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── LIBERO-plus ───────────────────────────────────────────────────────────
+  # Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of
+  # huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus).
+  libero-plus-integration-test:
+    name: LIBERO-plus — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      LIBERO_PLUS_SUITE: libero_spatial
+      LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus
+      LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]"
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build LIBERO-plus benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.libero_plus
+          push: false
+          load: true
+          tags: lerobot-benchmark-libero-plus:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
+          cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
+
+      - name: Run LIBERO-plus smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name libero-plus-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \
+            -e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \
+            -e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \
+            lerobot-benchmark-libero-plus:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=\"\$LIBERO_PLUS_POLICY\" \
+                --env.type=libero_plus \
+                --env.task=\"\$LIBERO_PLUS_SUITE\" \
+                --env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy LIBERO-plus artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/libero-plus-artifacts
+          docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
+          docker rm -f libero-plus-eval || true
+
+      - name: Parse LIBERO-plus eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/libero-plus-artifacts \
+            --env libero_plus \
+            --task "${LIBERO_PLUS_SUITE}" \
+            --policy "${LIBERO_PLUS_POLICY}"
+
+      - name: Upload LIBERO-plus rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-plus-rollout-video
+          path: /tmp/libero-plus-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload LIBERO-plus eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-plus-metrics
+          path: /tmp/libero-plus-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── VLABENCH ─────────────────────────────────────────────────────────────
+  # Isolated image: lerobot[vlabench] only (VLABench, mujoco==3.2.2, dm-control chain)
+  vlabench-integration-test:
+    name: VLABench — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      - name: Login to Docker Hub
+        if: ${{ env.DOCKERHUB_USERNAME != '' }}
+        uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+        env:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+      - name: Build VLABench benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.vlabench
+          push: false
+          load: true
+          tags: lerobot-benchmark-vlabench:ci
+          build-args: |
+            VLABENCH_ASSETS_REPO=lerobot/vlabench-assets
+
+      - name: Run VLABench smoke eval (10 tasks, 1 episode each)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          docker run --name vlabench-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            -e MUJOCO_GL=egl \
+            lerobot-benchmark-vlabench:ci \
+            bash -c "
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_vlabench \
+                --env.type=vlabench \
+                --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env vlabench \
+                --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      - name: Copy VLABench artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/vlabench-artifacts
+          docker cp vlabench-eval:/tmp/eval-artifacts/. /tmp/vlabench-artifacts/ 2>/dev/null || true
+          docker rm -f vlabench-eval || true
+
+      - name: Parse VLABench eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/vlabench-artifacts \
+            --env vlabench \
+            --task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+            --policy lerobot/smolvla_vlabench
+
+      - name: Upload VLABench rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: vlabench-rollout-video
+          path: /tmp/vlabench-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload VLABench eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: vlabench-metrics
+          path: /tmp/vlabench-artifacts/metrics.json
+          if-no-files-found: warn
@@ -0,0 +1,237 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Model Profiling
+
+on:
+  schedule:
+    - cron: "0 0 * * 0"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .github/workflows/model_profiling.yml
+      - src/lerobot/configs/train.py
+      - src/lerobot/scripts/lerobot_train.py
+      - src/lerobot/utils/model_profiling.py
+      - tests/test_model_profiling.py
+  workflow_dispatch:
+    inputs:
+      git_ref:
+        description: Git ref to profile when no commit SHA is provided
+        required: false
+        type: string
+        default: main
+      git_commit:
+        description: Optional exact commit SHA to profile
+        required: false
+        type: string
+        default: ""
+      policies:
+        description: Optional comma-separated policy filter
+        required: false
+        type: string
+        default: ""
+      profile_mode:
+        description: Torch profiler mode
+        required: false
+        type: choice
+        options:
+          - trace
+          - summary
+        default: trace
+      publish_results:
+        description: Publish results to the profiling dataset when a Hub token is available
+        required: false
+        type: boolean
+        default: true
+      results_repo:
+        description: Dataset repo name or fully qualified repo id
+        required: false
+        type: string
+        default: model-profiling-history
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.inputs.git_commit || github.event.inputs.git_ref || github.ref_name || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  profile-models:
+    name: Weekly Model Profiling
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      PROFILE_MODE: ${{ github.event_name == 'pull_request' && 'summary' || github.event.inputs.profile_mode || 'trace' }}
+      POLICY_FILTER: ${{ github.event_name == 'pull_request' && 'act,diffusion,pi0,pi05,smolvla,groot,xvla,wall_x' || github.event.inputs.policies || '' }}
+      RESULTS_REPO: ${{ github.event.inputs.results_repo || 'model-profiling-history' }}
+      SHOULD_PUBLISH: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_results == 'true') }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+          ref: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.event.inputs.git_ref || 'main' }}
+
+      - name: Pull GPU image
+        run: docker pull huggingface/lerobot-gpu:latest
+
+      - name: Run model profiling
+        env:
+          HOST_GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.sha }}
+          # Label results with the ref that is actually checked out: the PR head
+          # branch first, then the dispatched `git_ref` input. `github.ref_name`
+          # is populated for every trigger of this workflow, so it has to come
+          # after the input; otherwise the input is never used.
+          PROFILE_GIT_REF: ${{ github.head_ref || github.event.inputs.git_ref || github.ref_name || 'main' }}
+          PROFILE_PR_NUMBER: ${{ github.event.pull_request.number || '' }}
+        run: |
+          set -eux
+          mkdir -p profiling-results
+          docker run --rm --gpus all \
+            --user "$(id -u):$(id -g)" \
+            --shm-size=16g \
+            -e HOME=/tmp/lerobot-home \
+            -e HF_HOME=/tmp/hf \
+            -e HF_LEROBOT_HOME=/tmp/hf-lerobot \
+            -e TORCH_HOME=/tmp/torch-home \
+            -e TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor-cache \
+            -e UV_PROJECT_ENVIRONMENT=/tmp/lerobot-venv \
+            -e UV_CACHE_DIR=/tmp/uv-cache \
+            -e UV_PYTHON_PREFERENCE=only-system \
+            -e XDG_DATA_HOME=/tmp/xdg-data \
+            -e XDG_CACHE_HOME=/tmp/xdg-cache \
+            -e HOST_GIT_COMMIT="${HOST_GIT_COMMIT}" \
+            -e PROFILE_GIT_REF="${PROFILE_GIT_REF}" \
+            -e PROFILE_PR_NUMBER="${PROFILE_PR_NUMBER}" \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_TOKEN="${HF_USER_TOKEN}" \
+            -e PROFILE_MODE="${PROFILE_MODE}" \
+            -e POLICY_FILTER="${POLICY_FILTER}" \
+            -e RESULTS_REPO="${RESULTS_REPO}" \
+            -e SHOULD_PUBLISH="${SHOULD_PUBLISH}" \
+            -v "${GITHUB_WORKSPACE}:/workspace" \
+            -w /workspace \
+            huggingface/lerobot-gpu:latest \
+            bash -c '
+              set -euxo pipefail
+              mkdir -p "${HOME}" "${HF_HOME}" "${HF_LEROBOT_HOME}" "${TORCH_HOME}" "${UV_CACHE_DIR}" "${XDG_CACHE_HOME}" "${XDG_DATA_HOME}" "${TORCHINDUCTOR_CACHE_DIR}"
+              rm -rf /tmp/lerobot-src
+              cp -a /workspace/. /tmp/lerobot-src
+              cd /tmp/lerobot-src
+
+              if [[ -n "${HF_USER_TOKEN:-}" ]]; then
+                hf auth login --token "${HF_USER_TOKEN}" --add-to-git-credential 2>/dev/null || true
+              fi
+
+              policies_to_run=()
+              if [[ -n "${POLICY_FILTER}" ]]; then
+                IFS="," read -ra policies_to_run <<< "${POLICY_FILTER}"
+              else
+                policies_to_run=(act diffusion groot multi_task_dit pi0 pi0_fast pi05 smolvla wall_x xvla)
+              fi
+
+              policy_extras() {
+                case "$1" in
+                  act) ;;
+                  diffusion) echo "diffusion" ;;
+                  groot) echo "groot" ;;
+                  multi_task_dit) echo "multi_task_dit" ;;
+                  pi0|pi0_fast|pi05) echo "pi" ;;
+                  smolvla) echo "smolvla" ;;
+                  wall_x) echo "wallx" ;;
+                  xvla) echo "xvla" ;;
+                  *)
+                    echo "Unknown profiling policy $1" >&2
+                    return 1
+                    ;;
+                esac
+              }
+
+              # Policies whose dep-install may fail due to environment constraints
+              # (e.g. groot requires compiling flash-attn, which needs nvcc; the CI
+              # image only ships the CUDA runtime). Install failures for these are
+              # logged as warnings and do not fail the job. See the TODO next to
+              # `lerobot[groot]` in pyproject.toml.
+              is_install_failure_tolerated() {
+                case "$1" in
+                  groot) return 0 ;;
+                  *) return 1 ;;
+                esac
+              }
+
+              overall_status=0
+              for raw_policy in "${policies_to_run[@]}"; do
+                policy="$(echo "${raw_policy}" | xargs)"
+                [[ -z "${policy}" ]] && continue
+
+                echo "::group::Profile ${policy}"
+
+                extra="$(policy_extras "${policy}")" || { overall_status=1; echo "::endgroup::"; continue; }
+
+                # Fresh, isolated dependency resolution per policy so that
+                # incompatible extras (e.g. flash-attn for groot) never block
+                # the rest of the matrix.
+                sync_cmd=(uv sync --locked --extra training --extra test)
+                if [[ -n "${extra}" ]]; then
+                  sync_cmd+=(--extra "${extra}")
+                fi
+                # flash-attn does not declare torch as a build-time dep, so its
+                # isolated build env fails with ModuleNotFoundError. Torch is a
+                # core lerobot dep and is already resolved here, so we disable
+                # build isolation for flash-attn specifically.
+                sync_cmd+=(--no-build-isolation-package flash-attn)
+                if ! "${sync_cmd[@]}"; then
+                  if is_install_failure_tolerated "${policy}"; then
+                    echo "::warning::Dependency install failed for ${policy} (known-fragile); skipping."
+                  else
+                    echo "Dependency install failed for ${policy}; skipping." >&2
+                    overall_status=1
+                  fi
+                  echo "::endgroup::"
+                  continue
+                fi
+
+                cmd=(
+                  uv run python -m lerobot.utils.model_profiling
+                  --output_dir=/workspace/profiling-results
+                  --hub_org=lerobot
+                  --results_repo="${RESULTS_REPO}"
+                  --profile_mode="${PROFILE_MODE}"
+                  --git_commit="${HOST_GIT_COMMIT}"
+                  --git_ref="${PROFILE_GIT_REF}"
+                  --pr_number="${PROFILE_PR_NUMBER}"
+                  --policies "${policy}"
+                )
+                if [[ "${SHOULD_PUBLISH}" == "true" && -n "${HF_USER_TOKEN:-}" ]]; then
+                  cmd+=(--publish)
+                fi
+
+                if ! "${cmd[@]}"; then
+                  echo "Profiling failed for ${policy}." >&2
+                  overall_status=1
+                fi
+
+                echo "::endgroup::"
+              done
+
+              exit "${overall_status}"
+            '
+
+      - name: Upload profiling artifacts
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: model-profiling-results
+          path: profiling-results
+          if-no-files-found: warn
@@ -0,0 +1,84 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for LIBERO-plus integration tests.
+# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus
+# fork source + its 6.4 GB perturbation assets.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus .
+# Run:    docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+ENV MUJOCO_GL=egl
+
+# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
+# (wand / ImageMagick / fontconfig) not in the nightly base.
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
+# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
+# We install these explicitly instead of via the [libero_plus] extra because
+# the extra's `libero @ git+...` dep installs as a namespace package; instead
+# we clone the fork and PYTHONPATH-override it below.
+RUN uv pip install --no-cache \
+        "robosuite==1.4.1" \
+        "bddl==1.0.1" \
+        "easydict==1.13" \
+        "mujoco==3.7.0" \
+        "matplotlib==3.10.8" \
+        "Wand==0.6.13" \
+        "scikit-image==0.25.2" \
+        "gym==0.26.2"
+
+# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
+# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
+# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
+# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
+ARG LIBERO_PLUS_SHA=4976dc3
+ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
+RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
+    && git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
+    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
+    && (uv pip uninstall hf-libero 2>/dev/null || true)
+ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
+
+# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
+# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml). All
+# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's
+# assets.zip (6.4 GB) under a deep author-internal prefix — extract and
+# flatten it under ${LIBERO_PLUS_ROOT}/assets.
+RUN python -c "\
+from huggingface_hub import hf_hub_download; \
+hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
+                filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
+    && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
+    && ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \
+    && mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \
+    && rm -rf /tmp/libero-plus-dl
+
+# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are
+# non-interactive (it calls input() when the config is missing).
+RUN mkdir -p /home/user_lerobot/.libero \
+    && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
+       > /home/user_lerobot/.libero/config.yaml
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,71 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCasa365 integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code and RoboCasa-specific asset setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install robocasa + robosuite as editable clones. pip-installing from git
+# omits data files like robocasa/models/assets/box_links/box_links_assets.json
+# (not declared in package_data), which download_kitchen_assets needs at import.
+#
+# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
+# in install_requires, which would shadow the editable lerobot baked into
+# this image. We install robocasa's actual runtime deps explicitly instead.
+# Pinned SHAs for reproducible benchmark runs. Bump when you need an
+# upstream fix; don't rely on `main`/`master` drift.
+ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
+ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
+RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
+    git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
+    git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
+    git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
+    uv pip install --no-cache -e ~/robocasa --no-deps && \
+    uv pip install --no-cache -e ~/robosuite && \
+    uv pip install --no-cache \
+      "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
+      "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
+      "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
+      "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
+      "tianshou==0.4.10" "gymnasium==1.2.3"
+
+# Set up robocasa macros and download kitchen assets. We need:
+#   - tex              : base environment textures
+#   - tex_generative   : AI-generated textures; kitchen fixture XMLs embed
+#                        refs to generative_textures/wall/tex*.png
+#                        unconditionally, so MjModel.from_xml_string fails
+#                        at reset time without them (even if the env is
+#                        constructed with generative_textures=None).
+#   - fixtures_lw      : lightwheel kitchen fixtures (fridge, counters...)
+#   - objs_lw          : lightwheel object meshes (stools, misc props)
+# We skip the objaverse/aigen object packs (~30GB combined) by pairing
+# this with --env.obj_registries=["lightwheel"] on the lerobot side.
+# The download script prompts interactively, so pipe 'y' to auto-accept.
+RUN python -m robocasa.scripts.setup_macros && \
+    yes y | python -m robocasa.scripts.download_kitchen_assets \
+      --type tex tex_generative fixtures_lw objs_lw
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
+# replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,43 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCerebra integration tests.
+# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different
+# rename_map, so this image is identical to the LIBERO benchmark image —
+# extends the nightly GPU base with LIBERO assets + the PR's source code.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
+# runtime (which times out on CI). Point the libero config at the cached path.
+# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
+# so we write the config before any libero import can happen.
+RUN LIBERO_DIR=$(python -c \
+      "import importlib.util, os; s=importlib.util.find_spec('libero'); \
+       print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
+    mkdir -p /home/user_lerobot/.libero && \
+    python -c "\
+from huggingface_hub import snapshot_download; \
+snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
+                  local_dir='/home/user_lerobot/.libero/assets')" && \
+    printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
+    > /home/user_lerobot/.libero/config.yaml
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,56 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboMME integration tests.
+# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system
+# libs for ManiSkill/SAPIEN and the robomme package. robomme isn't in [all]
+# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which
+# conflict with lerobot's defaults; both are safe at runtime:
+#   - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26)
+#   - numpy 1.26.4 is API-compatible with lerobot's actual usage.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme .
+# Run:    docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering.
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
+
+# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image.
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         libvulkan1 libvulkan-dev mesa-vulkan-drivers \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+       > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top
+# with gymnasium/numpy overrides. robomme isn't a pyproject extra because its
+# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml).
+#
+# ROBOMME_REF selects the robomme_benchmark git ref to install. It defaults to
+# `main`, which drifts over time; pass --build-arg ROBOMME_REF=<commit SHA>
+# to get a reproducible image.
+ARG ROBOMME_REF=main
+COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
+RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
+    && uv pip install --no-cache --override /tmp/robomme_override.txt \
+         -e ".[smolvla,av-dep]" \
+         "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@${ROBOMME_REF}" \
+    && python -c "import robomme; print('robomme import OK')"
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,122 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboTwin 2.0 integration tests.
+# Extends the nightly GPU image with the RoboTwin simulator stack:
+#   sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
+# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
+#
+# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
+# Run:   docker run --gpus all --rm lerobot-benchmark-robotwin \
+#            lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
+
+FROM huggingface/lerobot-gpu:latest
+
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
+    ROBOTWIN_ROOT=/opt/robotwin
+
+# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
+# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
+USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
+         libvulkan1 vulkan-tools \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+       > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
+    && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# RoboTwin runtime deps (av is already in the base via [av-dep]).
+RUN uv pip install --no-cache \
+        "sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
+        "open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
+
+# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
+RUN uv pip install --no-cache --no-build-isolation \
+        "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+
+# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
+# build aborts on an empty arch list. Pinned SHA for reproducibility.
+ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
+RUN cd ${ROBOTWIN_ROOT}/envs \
+    && git clone https://github.com/NVlabs/curobo.git \
+    && git -C curobo checkout ${CUROBO_SHA} \
+    && cd curobo \
+    && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
+       uv pip install -e . --no-build-isolation --no-cache
+
+# Upstream patches (mirror RoboTwin's script/_install.sh).
+# These patches target the exact versions pinned above; re-check when upgrading.
+# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
+#   Safe to remove once mplib > 0.2.1 ships with the fix upstream.
+# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
+#   Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
+RUN python - <<'EOF'
+import pathlib, re, site
+
+
+def patch(path, transform, label):
+    """Rewrite `path` with `transform(text)` and report whether anything changed."""
+    before = path.read_text()
+    after = transform(before)
+    if after == before:
+        # Nothing matched: do not claim success, so a stale patch is visible
+        # in the build log instead of being reported as applied.
+        print(f"WARNING: {label} patch matched nothing in {path}")
+        return
+    path.write_text(after)
+    print(f"{label} patch applied: {path}")
+
+
+for d in site.getsitepackages():
+    p = pathlib.Path(d) / "mplib" / "planner.py"
+    if p.exists():
+        # Drop the first `or collide` clause only (count=1).
+        patch(p, lambda s: re.sub(r"\bor collide\b", "", s, count=1), "mplib")
+    p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
+    if p.exists():
+        # Open the SRDF as UTF-8 and compare against the ".srdf" extension.
+        patch(
+            p,
+            lambda s: s.replace(
+                "with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'
+            ).replace('"srdf"', '".srdf"'),
+            "sapien",
+        )
+EOF
+
+# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
+# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
+# The dataset is public — no auth token needed.
+RUN python - <<'EOF'
+import os, pathlib, zipfile
+from huggingface_hub import hf_hub_download
+
+assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
+assets_dir.mkdir(parents=True, exist_ok=True)
+for fname in ("embodiments.zip", "objects.zip"):
+    local = hf_hub_download(
+        repo_id="TianxingChen/RoboTwin2.0",
+        repo_type="dataset",
+        filename=fname,
+        local_dir=str(assets_dir),
+    )
+    with zipfile.ZipFile(local, "r") as z:
+        z.extractall(str(assets_dir))
+    pathlib.Path(local).unlink()
+EOF
+
+WORKDIR ${ROBOTWIN_ROOT}
+RUN python script/update_embodiment_config_path.py
+
+ENV PYTHONPATH="${ROBOTWIN_ROOT}:${PYTHONPATH}"
+
+# Return to the lerobot source directory (set by base image) before overlaying.
+WORKDIR /lerobot
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for VLABench integration tests.
+# Extends the nightly GPU image with the PR's source code and VLABench setup.
+#
+# Build:  docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench .
+# Run:    docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control.
+# Clone without submodule recursion (nested SSH-only submodules fail in CI).
+# Editable install (-e) because VLABench/utils/ has no __init__.py, so
+# find_packages() omits it from wheels; editable mode uses the source tree directly.
+# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py).
+# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import
+# time. Guard the call so missing dirs return [] instead of crashing (in case
+# the asset download is partial).
+#
+# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need
+# an upstream fix; don't rely on `main`/`develop` drift.
+ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017
+ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85
+RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \
+    git -C ~/VLABench checkout ${VLABENCH_SHA} && \
+    git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \
+    git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \
+    python3 -c "\
+import pathlib; \
+p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \
+t = p.read_text(); \
+p.write_text(t.replace( \
+    'subdirs = os.listdir(xml_dir)', \
+    'if not os.path.isdir(xml_dir): return []\n    subdirs = os.listdir(xml_dir)'))" && \
+    uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \
+      mujoco==3.2.2 dm-control==1.0.22 \
+      open3d colorlog scikit-learn openai gdown
+
+# Download VLABench mesh assets. Task configs reference object meshes
+# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without
+# them the task builder picks from an empty mesh list and crashes with
+# IndexError at task-build time (random.choice([]) in config_manager.py).
+#
+# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time
+# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll
+# snapshot_download the repo into VLABench's assets dir. This is the reliable
+# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have
+# viewed or downloaded this file recently") on shared academic files.
+#
+# After download we *validate* that at least one XML exists under each
+# task-critical subtree and fail the build loudly if not. Silent-empty asset
+# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them
+# here rather than after a 10-minute eval build.
+#
+# Fallback: VLABench's own gdown-based script. Best-effort only.
+ARG VLABENCH_ASSETS_REPO=""
+RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \
+    if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \
+        echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \
+        uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \
+        python -c "from huggingface_hub import snapshot_download; \
+p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \
+    local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \
+print('snapshot_download returned:', p)"; \
+    else \
+        echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \
+        python ~/VLABench/scripts/download_assets.py --choice all; \
+    fi && \
+    python -c "\
+from pathlib import Path; \
+import sys; \
+root = Path('${ASSETS_DIR}'); \
+checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \
+failed = []; \
+print(f'Validating VLABench assets under {root}'); \
+[print(f'  {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \
+[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \
+sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')"
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with VLABenchEnv registration
+# and updated obs handling) replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
@@ -77,10 +77,22 @@
    title: Adding a New Benchmark
  - local: libero
    title: LIBERO
+  - local: libero_plus
+    title: LIBERO-plus
  - local: metaworld
    title: Meta-World
+  - local: robotwin
+    title: RoboTwin 2.0
+  - local: robocasa
+    title: RoboCasa365
+  - local: robocerebra
+    title: RoboCerebra
+  - local: robomme
+    title: RoboMME
  - local: envhub_isaaclab_arena
    title: NVIDIA IsaacLab Arena Environments
+  - local: vlabench
+    title: VLABench
  title: "Benchmarks"
 - sections:
  - local: introduction_processors
@@ -0,0 +1,188 @@
+# LIBERO-plus
+
+LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss.
+
+- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626)
+- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus)
+- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
+
+![An overview of the LIBERO-plus benchmark perturbation dimensions](https://github.com/sylvestf/LIBERO-plus/raw/main/static/images/libero-plus.jpg)
+
+## Perturbation dimensions
+
+LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes:
+
+| Dimension             | What changes                                          |
+| --------------------- | ----------------------------------------------------- |
+| Objects layout        | Target position, presence of confounding objects      |
+| Camera viewpoints     | Camera position, orientation, field-of-view           |
+| Robot initial states  | Manipulator start pose                                |
+| Language instructions | LLM-rewritten task description (paraphrase / synonym) |
+| Light conditions      | Intensity, direction, color, shadow                   |
+| Background textures   | Scene surface and object appearance                   |
+| Sensor noise          | Photometric distortions and image degradation         |
+
+## Available task suites
+
+LIBERO-plus covers the same five suites as LIBERO:
+
+| Suite          | CLI name         | Tasks | Max steps | Description                                        |
+| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- |
+| LIBERO-Spatial | `libero_spatial` | 10    | 280       | Tasks requiring reasoning about spatial relations  |
+| LIBERO-Object  | `libero_object`  | 10    | 280       | Tasks centered on manipulating different objects   |
+| LIBERO-Goal    | `libero_goal`    | 10    | 300       | Goal-conditioned tasks with changing targets       |
+| LIBERO-90      | `libero_90`      | 90    | 400       | Short-horizon tasks from the LIBERO-100 collection |
+| LIBERO-Long    | `libero_10`      | 10    | 520       | Long-horizon tasks from the LIBERO-100 collection  |
+
+<Tip warning={true}>
+  Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero`
+  so that `import libero` resolves to the LIBERO-plus fork. You cannot have both
+  installed at the same time. To switch back to vanilla LIBERO, uninstall the
+  fork and reinstall with `pip install -e ".[libero]"`.
+</Tip>
+
+## Installation
+
+### System dependencies (Linux only)
+
+```bash
+sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev
+```
+
+### Python package
+
+```bash
+pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym
+git clone https://github.com/sylvestf/LIBERO-plus.git
+cd LIBERO-plus && pip install --no-deps -e .
+pip uninstall -y hf-libero  # so `import libero` resolves to the fork
+```
+
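+LIBERO-plus is installed from a clone of its GitHub fork rather than through a pyproject extra: the fork ships as a namespace package that pip can't handle as a regular dependency, so it must be cloned and installed from the checkout as shown above. If `import libero` still does not resolve to the fork afterwards, add the clone to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported.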
+
+<Tip>
+Set the MuJoCo rendering backend before running evaluation:
+
+```bash
+export MUJOCO_GL=egl   # headless / HPC / cloud
+```
+
+</Tip>
+
+### Download LIBERO-plus assets
+
+LIBERO-plus ships its extended asset pack separately. Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
+
+```bash
+# After installing the package, find where it was installed:
+python -c "import libero; print(libero.__file__)"
+# Then extract assets.zip into <package_root>/libero/assets/
+```
+
+## Evaluation
+
+### Default evaluation (recommended)
+
+Evaluate across the four standard suites (10 episodes per task):
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object,libero_goal,libero_10 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --env.max_parallel_tasks=1
+```
+
+### Single-suite evaluation
+
+Evaluate on one LIBERO-plus suite:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
+- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
+- `--eval.batch_size` controls how many environments run in parallel.
+- `--eval.n_episodes` sets how many episodes to run per task.
+
+### Multi-suite evaluation
+
+Benchmark a policy across multiple suites at once by passing a comma-separated list:
+
+```bash
+lerobot-eval \
+  --policy.path="your-policy-id" \
+  --env.type=libero_plus \
+  --env.task=libero_spatial,libero_object \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+### Control mode
+
+LIBERO-plus supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
+
+```bash
+--env.control_mode=relative   # or "absolute"
+```
+
+### Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
+- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
+- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
+
+### Recommended evaluation episodes
+
+For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
+
+## Training
+
+### Dataset
+
+A LeRobot-format training dataset for LIBERO-plus is available at:
+
+- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
+
+### Example training command
+
+```bash
+lerobot-train \
+    --policy.type=smolvla \
+    --policy.repo_id=${HF_USER}/smolvla_libero_plus \
+    --policy.load_vlm_weights=true \
+    --dataset.repo_id=lerobot/libero_plus \
+    --env.type=libero_plus \
+    --env.task=libero_spatial \
+    --output_dir=./outputs/ \
+    --steps=100000 \
+    --batch_size=4 \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1 \
+    --eval_freq=1000
+```
+
+## Relationship to LIBERO
+
+LIBERO-plus is a drop-in extension of LIBERO:
+
+- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`)
+- Same camera names and observation/action format
+- Same task suite names
+- Installs under the same `libero` Python package name (different GitHub repo)
+
+To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`.
@@ -0,0 +1,188 @@
+# RoboCasa365
+
+[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
+
+- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
+- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
+- Project website: [robocasa.ai](https://robocasa.ai)
+- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
+- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
+
+<img
+  src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
+  alt="RoboCasa365 benchmark overview"
+  width="85%"
+/>
+
+## Available tasks
+
+RoboCasa365 organizes its 365 tasks into two families (see the table below). LeRobot additionally exposes the upstream benchmark groups as first-class `--env.task` shortcuts, listed at the end of this section.
+
+| Family    | Tasks | Description                                                                     |
+| --------- | ----- | ------------------------------------------------------------------------------- |
+| Atomic    | ~65   | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
+| Composite | ~300  | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc.     |
+
+**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
+
+**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
+
+`--env.task` accepts three forms:
+
+- a single task name (`CloseFridge`)
+- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
+- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
+
+## Installation
+
+RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` so that pip skips its conflicting `lerobot` pin):
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/robocasa/robocasa.git ~/robocasa
+git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
+pip install -e ~/robocasa --no-deps
+pip install -e ~/robosuite
+
+# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
+# the bad lerobot pin).
+pip install numpy numba scipy mujoco pygame Pillow opencv-python \
+            pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
+            tianshou gymnasium
+
+python -m robocasa.scripts.setup_macros
+# Lightweight assets (lightwheel object meshes + textures). Enough for
+# the default env out of the box.
+python -m robocasa.scripts.download_kitchen_assets \
+  --type tex tex_generative fixtures_lw objs_lw
+# Optional: full objaverse/aigen registries (~30GB) for richer object
+# variety. Enable at eval time via --env.obj_registries (see below).
+# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
+```
+
+<Tip>
+RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+### Object registries
+
+By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
+
+```bash
+--env.obj_registries='[objaverse,lightwheel]'
+```
+
+## Evaluation
+
+All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Benchmark-group evaluation
+
+Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocasa \
+  --env.type=robocasa \
+  --env.task=atomic_seen \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results.
+
+## Policy inputs and outputs
+
+**Observations** (raw RoboCasa camera names are preserved verbatim):
+
+- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
+- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
+- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
+- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
+
+## Training
+
+### Single-task example
+
+A ready-to-use single-task dataset is on the Hub:
+[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
+
+Fine-tune a SmolVLA base on `CloseFridge`:
+
+```bash
+lerobot-train \
+  --policy.type=smolvla \
+  --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --policy.load_vlm_weights=true \
+  --policy.push_to_hub=true \
+  --dataset.repo_id=pepijn223/robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --output_dir=./outputs/smolvla_robocasa_CloseFridge \
+  --steps=100000 \
+  --batch_size=4 \
+  --eval_freq=5000 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5 \
+  --save_freq=10000
+```
+
+Evaluate the resulting checkpoint:
+
+```bash
+lerobot-eval \
+  --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
+  --env.type=robocasa \
+  --env.task=CloseFridge \
+  --eval.batch_size=1 \
+  --eval.n_episodes=20
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
@@ -0,0 +1,99 @@
+# RoboCerebra
+
+[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF).
+
+- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
+- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/)
+- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks.
+- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra)
+
+## Available tasks
+
+RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite:
+
+| Suite     | CLI name    | Tasks | Description                                                   |
+| --------- | ----------- | ----- | ------------------------------------------------------------- |
+| LIBERO-10 | `libero_10` | 10    | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals |
+
+Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals.
+
+## Installation
+
+RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need:
+
+```bash
+pip install -e ".[libero]"
+```
+
+<Tip>
+RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+## Evaluation
+
+RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_robocerebra \
+  --env.type=libero \
+  --env.task=libero_10 \
+  --env.fps=20 \
+  --env.obs_type=pixels_agent_pos \
+  --env.observation_height=256 \
+  --env.observation_width=256 \
+  '--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \
+  --policy.empty_cameras=1
+```
+
+### Recommended evaluation episodes
+
+**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper.
+
+## Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper)
+- `observation.images.image` — third-person view, 256×256 HWC uint8
+- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D)
+
+## Training
+
+The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations:
+
+| Feature                          | Shape         | Description          |
+| -------------------------------- | ------------- | -------------------- |
+| `observation.images.image`       | (256, 256, 3) | Third-person view    |
+| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera |
+| `observation.state`              | (8,)          | Joint pos + gripper  |
+| `action`                         | (7,)          | EEF delta + gripper  |
+
+Fine-tune a SmolVLA base on it:
+
+```bash
+lerobot-train \
+  --policy.path=lerobot/smolvla_base \
+  --dataset.repo_id=lerobot/robocerebra_unified \
+  --env.type=libero \
+  --env.task=libero_10 \
+  --output_dir=outputs/smolvla_robocerebra
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.
@@ -0,0 +1,130 @@
+# RoboMME
+
+[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
+
+- **16 tasks** across 4 memory-skill suites
+- **1,600 training demos** (100 per task), plus 50 validation and 50 test episodes per task
+- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
+- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
+
+![RoboMME benchmark tasks overview](https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2603.04639/gradient.png)
+
+## Tasks
+
+| Suite                             | Tasks                                                         |
+| --------------------------------- | ------------------------------------------------------------- |
+| **Counting** (temporal memory)    | BinFill, PickXtimes, SwingXtimes, StopCube                    |
+| **Permanence** (spatial memory)   | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap  |
+| **Reference** (object memory)     | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
+| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick                  |
+
+## Installation
+
+> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
+
+### Native (Linux)
+
+```bash
+# `--override` is a uv option; plain pip has no equivalent flag.
+uv pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
+  -e '.[smolvla,av-dep]' \
+  'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
+```
+
+> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely.
+
+### Docker (recommended)
+
+```bash
+# Build base image first (from repo root)
+docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
+
+# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
+docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
+```
+
+The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
+
+## Running Evaluation
+
+### Default (single task, single episode)
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_repo> \
+    --env.type=robomme \
+    --env.task=PickXtimes \
+    --env.dataset_split=test \
+    --env.task_ids=[0] \
+    --eval.batch_size=1 \
+    --eval.n_episodes=1
+```
+
+### Multi-task evaluation
+
+Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
+
+```bash
+lerobot-eval \
+    --policy.path=<your_policy_repo> \
+    --env.type=robomme \
+    --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+    --env.dataset_split=test \
+    --env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
+    --eval.batch_size=1 \
+    --eval.n_episodes=50
+```
+
+### Key CLI options for `env.type=robomme`
+
+| Option               | Default       | Description                                        |
+| -------------------- | ------------- | -------------------------------------------------- |
+| `env.task`           | `PickXtimes`  | Any of the 16 task names above (comma-separated)   |
+| `env.dataset_split`  | `test`        | `train`, `val`, or `test`                          |
+| `env.action_space`   | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D)             |
+| `env.episode_length` | `300`         | Max steps per episode                              |
+| `env.task_ids`       | `null`        | List of episode indices to evaluate (null = `[0]`) |
+
+## Dataset
+
+The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
+
+```python
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+dataset = LeRobotDataset("lerobot/robomme")
+```
+
+### Dataset features
+
+| Feature            | Shape         | Description                     |
+| ------------------ | ------------- | ------------------------------- |
+| `image`            | (256, 256, 3) | Front camera RGB                |
+| `wrist_image`      | (256, 256, 3) | Wrist camera RGB                |
+| `actions`          | (8,)          | Joint angles + gripper          |
+| `state`            | (8,)          | Joint positions + gripper state |
+| `simple_subgoal`   | str           | High-level language annotation  |
+| `grounded_subgoal` | str           | Grounded language annotation    |
+| `episode_index`    | int           | Episode ID                      |
+| `frame_index`      | int           | Frame within episode            |
+
+### Feature key alignment (training)
+
+The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`.
+
+The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning.
+
+## Action Spaces
+
+| Type          | Dim | Description                                               |
+| ------------- | --- | --------------------------------------------------------- |
+| `joint_angle` | 8   | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) |
+| `ee_pose`     | 7   | xyz + roll/pitch/yaw + gripper                            |
+
+Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`.
+
+## Platform Notes
+
+- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported.
+- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed.
+- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically.
+- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package.
@@ -0,0 +1,223 @@
+# RoboTwin 2.0
+
+RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
+
+- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
+- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
+- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
+- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
+
+![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png)
+
+## Overview
+
+| Property      | Value                                                    |
+| ------------- | -------------------------------------------------------- |
+| Tasks         | 50 dual-arm manipulation tasks                           |
+| Robot         | Aloha-AgileX bimanual (14 DOF, 7 per arm)                |
+| Action space  | 14-dim joint-space, continuous in `[-1, 1]`              |
+| Cameras       | `head_camera`, `left_camera`, `right_camera`             |
+| Simulator     | SAPIEN (not MuJoCo)                                      |
+| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations          |
+| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
+
+## Available tasks
+
+RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
+
+| Task                     | CLI name                                 | Category          |
+| ------------------------ | ---------------------------------------- | ----------------- |
+| Beat block with hammer   | `beat_block_hammer`                      | Tool use          |
+| Click bell / alarm clock | `click_bell`, `click_alarmclock`         | Precision press   |
+| Stack blocks (2 / 3)     | `stack_blocks_two`, `stack_blocks_three` | Stacking          |
+| Stack bowls (2 / 3)      | `stack_bowls_two`, `stack_bowls_three`   | Stacking          |
+| Handover block / mic     | `handover_block`, `handover_mic`         | Bimanual coord.   |
+| Lift pot                 | `lift_pot`                               | Bimanual lift     |
+| Shake bottle             | `shake_bottle`                           | Continuous motion |
+| Turn switch              | `turn_switch`                            | Articulated obj   |
+| Stamp seal               | `stamp_seal`                             | Precision place   |
+| Scan object              | `scan_object`                            | Mobile manip.     |
+
+Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
+
+<Tip warning={true}>
+  `open_laptop` is currently broken upstream (its `check_success()` uses
+  `self.arm_tag`, which is only set inside the scripted-expert `play_once()`
+  path and therefore unavailable during normal policy eval). Avoid it until the
+  upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
+  `load_actors()`.
+</Tip>
+
+## Dataset
+
+The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
+
+```
+lerobot/robotwin_unified
+```
+
+It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
+
+You can load it directly with the HF Datasets library:
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("lerobot/robotwin_unified", split="train")
+```
+
+## Installation
+
+RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
+
+### 1. Create a conda environment
+
+```bash
+conda create -n robotwin python=3.10 -y
+conda activate robotwin
+```
+
+### 2. Install LeRobot
+
+```bash
+git clone https://github.com/huggingface/lerobot.git
+cd lerobot
+pip install -e "."
+```
+
+### 3. Install RoboTwin 2.0
+
+```bash
+git clone https://github.com/RoboTwin-Platform/RoboTwin.git
+cd RoboTwin
+bash script/_install.sh
+bash script/_download_assets.sh
+```
+
+The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
+
+<Tip warning={true}>
+If the automated install fails, install manually:
+
+```bash
+pip install -r requirements.txt
+pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
+pip install -e . --no-build-isolation
+```
+
+Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
+
+</Tip>
+
+### 4. Add RoboTwin to PYTHONPATH
+
+The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
+
+```bash
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+```
+
+Add this to your shell profile to make it permanent.
+
+## Evaluation
+
+### Standard evaluation (recommended)
+
+Evaluate a policy on a single task with the official protocol (100 episodes):
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Single-task quick check
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --eval.batch_size=1 \
+  --eval.n_episodes=5
+```
+
+### Multi-task sweep
+
+Evaluate on several tasks in one run:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+### Full benchmark (all 50 tasks)
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
+  --eval.batch_size=1 \
+  --eval.n_episodes=100
+```
+
+<Tip>
+  `open_laptop` is intentionally omitted above because of the upstream
+  `self.arm_tag` bug (see the **Available tasks** section), so the command
+  runs 49 of the 50 tasks. Re-add it once the upstream fix lands.
+</Tip>
+
+## Camera configuration
+
+By default, all three cameras are included:
+
+| Camera key     | Description                    |
+| -------------- | ------------------------------ |
+| `head_camera`  | Torso-mounted overhead view    |
+| `left_camera`  | Left arm wrist-mounted camera  |
+| `right_camera` | Right arm wrist-mounted camera |
+
+To use a subset of cameras, override `--env.camera_names`:
+
+```bash
+lerobot-eval \
+  --policy.path="your-hf-policy-id" \
+  --env.type=robotwin \
+  --env.task=beat_block_hammer \
+  --env.camera_names="head_camera,left_camera" \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10
+```
+
+## Environment config reference
+
+Key parameters for `RoboTwinEnvConfig`:
+
+| Parameter            | Default                                  | Description                        |
+| -------------------- | ---------------------------------------- | ---------------------------------- |
+| `task`               | `"beat_block_hammer"`                    | Comma-separated task name(s)       |
+| `fps`                | `25`                                     | Simulation FPS                     |
+| `episode_length`     | `300`                                    | Max steps per episode              |
+| `obs_type`           | `"pixels_agent_pos"`                     | `"pixels"` or `"pixels_agent_pos"` |
+| `camera_names`       | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras     |
+| `observation_height` | `240`                                    | Camera pixel height                |
+| `observation_width`  | `320`                                    | Camera pixel width                 |
+
+## Leaderboard submission
+
+Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
+
+- Training on 50 `demo_clean` demonstrations per task
+- Evaluating 100 episodes per task
+- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
+
+For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
@@ -0,0 +1,176 @@
+# VLABench
+
+[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below).
+
+- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194)
+- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench)
+- Project website: [vlabench.github.io](https://vlabench.github.io)
+- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench)
+
+<img
+  src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/vlabench.png"
+  alt="VLABench benchmark overview"
+  width="85%"
+/>
+
+## Available tasks
+
+LeRobot exposes two VLABench task suites, covering **43 task categories**, through `--env.task`:
+
+| Suite     | CLI name    | Tasks | Description                                                      |
+| --------- | ----------- | ----- | ---------------------------------------------------------------- |
+| Primitive | `primitive` | 21    | Single / few-skill combinations (select, insert, physics QA)     |
+| Composite | `composite` | 22    | Multi-step reasoning and long-horizon planning (cook, rearrange) |
+
+**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`).
+
+**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`.
+
+`--env.task` accepts three forms:
+
+- a single task name (`select_fruit`)
+- a comma-separated list (`select_fruit,heat_food`)
+- a suite shortcut (`primitive`, `composite`, or `primitive,composite`)
+
+## Installation
+
+VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets:
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench
+git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms
+pip install -e ~/VLABench -e ~/rrt-algorithms
+pip install "mujoco==3.2.2" "dm-control==1.0.22" \
+            open3d colorlog scikit-learn openai gdown
+
+python ~/VLABench/scripts/download_assets.py
+```
+
+<Tip>
+VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running:
+
+```bash
+export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
+```
+
+</Tip>
+
+## Evaluation
+
+All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=select_fruit \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=select_fruit,select_toy,add_condiment,heat_food \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Suite-wide evaluation
+
+Run an entire suite (all 21 primitives or all 22 composites):
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=primitive \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  --env.max_parallel_tasks=1 \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+Or both suites:
+
+```bash
+lerobot-eval \
+  --policy.path=lerobot/smolvla_vlabench \
+  --env.type=vlabench \
+  --env.task=primitive,composite \
+  --eval.batch_size=1 \
+  --eval.n_episodes=10 \
+  --eval.use_async_envs=false \
+  --policy.device=cuda \
+  --env.max_parallel_tasks=1 \
+  '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+Use **10 episodes per task** for reproducible benchmarking (210 episodes in total for the full primitive suite, 220 for the composite suite). This matches the protocol in the VLABench paper.
+
+## Policy inputs and outputs
+
+**Observations:**
+
+- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper)
+- `observation.images.image` — front camera, 480×480 HWC uint8
+- `observation.images.second_image` — second camera, 480×480 HWC uint8
+- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper.
+
+## Training
+
+### Datasets
+
+Pre-collected VLABench datasets in LeRobot format on the Hub:
+
+- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images.
+- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images.
+
+### Example training command
+
+Fine-tune a SmolVLA base on the primitive suite:
+
+```bash
+lerobot-train \
+  --policy.type=smolvla \
+  --policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \
+  --policy.load_vlm_weights=true \
+  --policy.push_to_hub=true \
+  --dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \
+  --env.type=vlabench \
+  --env.task=select_fruit \
+  --output_dir=./outputs/smolvla_vlabench_primitive \
+  --steps=100000 \
+  --batch_size=4 \
+  --eval_freq=5000 \
+  --eval.batch_size=1 \
+  --eval.n_episodes=1 \
+  --save_freq=10000
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark.
@@ -212,6 +212,20 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
 pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
 libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
 metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
+# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
+# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
+# release), so any `vlabench>=X` pip spec is unresolvable. Install it
+# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx
+# for the recipe.
+# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
+# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
+# resolve into a single env. Install it only in the RoboMME Docker image
+# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme).
+# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
+# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
+# workspace `lerobot` and makes the graph unsolvable under any resolver
+# (uv, pip). Install it manually alongside robosuite — see
+# docs/source/robocasa.mdx for the recipe.

 # All
 all = [
@@ -31,9 +31,23 @@ from __future__ import annotations

 import argparse
 import json
+import re
 import sys
 from pathlib import Path

+# LIBERO-plus derives task.language by space-joining the perturbation-variant
+# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py),
+# so non-_language_ variants inherit a trailing metadata blob like
+# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so
+# the description matches the base instruction used in the training dataset.
+_LIBERO_PERTURBATION_TAIL_RE = re.compile(
+    r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
+)
+
+
+def _strip_libero_perturbation_tail(instruction: str) -> str:
+    return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
+

 def _libero_descriptions(task_suite: str) -> dict[str, str]:
    from libero.libero import benchmark  # type: ignore[import-untyped]
@@ -47,7 +61,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
        )
        return {}
    suite = suite_dict[task_suite]()
-    return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
+    return {
+        f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
+        for i in range(suite.n_tasks)
+    }


 def _metaworld_descriptions(task_name: str) -> dict[str, str]:
@@ -57,19 +74,120 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
    return {f"{task_name}_0": label}


+def _robotwin_descriptions(task_names: str) -> dict[str, str]:
+    """Return descriptions for each requested RoboTwin task. Reads
+    `description/task_instruction/<task>.json` from the RoboTwin clone
+    (cwd is /opt/robotwin in CI). Falls back to the task name if missing."""
+    out: dict[str, str] = {}
+    root = Path("description/task_instruction")
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc_file = root / f"{name}.json"
+        desc = name.replace("_", " ")
+        if desc_file.is_file():
+            data = json.loads(desc_file.read_text())
+            full = data.get("full_description") or desc
+            # Strip the schema placeholders ({A}, {a}) — keep the sentence readable.
+            desc = full.replace("<", "").replace(">", "")
+        out[f"{name}_0"] = desc
+    return out
+
+
+def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
+    """For each task in the comma-separated list, emit a cleaned-name label.
+
+    RoboCasa episodes carry their language instruction in the env's
+    `ep_meta['lang']`, populated per reset. Pulling it requires spinning
+    up the full kitchen env per task (~seconds each); we use the task
+    name as the key here and let the eval's episode info carry the
+    actual instruction.
+    """
+    out: dict[str, str] = {}
+    for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+        # Split CamelCase into words: "CloseFridge" → "close fridge".
+        label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
+        out[f"{task}_0"] = label or task
+    return out
+
+
+_ROBOMME_DESCRIPTIONS = {
+    "BinFill": "Fill the target bin with the correct number of cubes",
+    "PickXtimes": "Pick the indicated cube the specified number of times",
+    "SwingXtimes": "Swing the object the specified number of times",
+    "StopCube": "Grasp and stop the moving cube",
+    "VideoUnmask": "Pick the cube shown in the reference video",
+    "VideoUnmaskSwap": "Pick the cube matching the reference video after a swap",
+    "ButtonUnmask": "Press the button indicated by the reference",
+    "ButtonUnmaskSwap": "Press the correct button after objects are swapped",
+    "PickHighlight": "Pick the highlighted cube",
+    "VideoRepick": "Repick the cube shown in the reference video",
+    "VideoPlaceButton": "Place the cube on the button shown in the video",
+    "VideoPlaceOrder": "Place cubes in the order shown in the video",
+    "MoveCube": "Move the cube to the target location",
+    "InsertPeg": "Insert the peg into the target hole",
+    "PatternLock": "Unlock the pattern by pressing buttons in sequence",
+    "RouteStick": "Route the stick through the required waypoints",
+}
+
+
+def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]:
+    """Return descriptions for each requested RoboMME task. Keys match the
+    video filename pattern `<task>_<task_id>` used by the eval script."""
+    if task_ids is None:
+        task_ids = [0]
+    out: dict[str, str] = {}
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc = _ROBOMME_DESCRIPTIONS.get(name, name)
+        for tid in task_ids:
+            out[f"{name}_{tid}"] = desc
+    return out
+
+
+def _vlabench_descriptions(task_spec: str) -> dict[str, str]:
+    """For each task in the comma-separated list, emit a cleaned-name label.
+
+    VLABench tasks carry language instructions on their dm_control task
+    object, but pulling them requires loading the full env per task
+    (~seconds each). The CI smoke-eval already captures the instruction
+    inside its episode info; this mapping is just enough to key
+    `metrics.json` by `<task>_0`.
+    """
+    out: dict[str, str] = {}
+    for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+        out[f"{task}_0"] = task.replace("_", " ").strip()
+    return out
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
    parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
+    parser.add_argument(
+        "--task-ids",
+        type=str,
+        default=None,
+        help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]",
+    )
    parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
    args = parser.parse_args()

+    task_ids: list[int] | None = None
+    if args.task_ids:
+        task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
+
    descriptions: dict[str, str] = {}
    try:
-        if args.env == "libero":
+        if args.env == ("libero", "libero_plus"):
            descriptions = _libero_descriptions(args.task)
        elif args.env == "metaworld":
            descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robotwin":
+            descriptions = _robotwin_descriptions(args.task)
+        elif args.env == "robocasa":
+            descriptions = _robocasa_descriptions(args.task)
+        elif args.env == "robomme":
+            descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
+        elif args.env == "vlabench":
+            descriptions = _vlabench_descriptions(args.task)
        else:
            print(
                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
@@ -16,7 +16,7 @@ import datetime as dt
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 import draccus
 from huggingface_hub import hf_hub_download
@@ -58,6 +58,8 @@ class TrainPipelineConfig(HubMixin):
    batch_size: int = 8
    prefetch_factor: int = 4
    persistent_workers: bool = True
+    profile_mode: Literal["off", "summary", "trace"] = "off"
+    profile_output_dir: Path | None = None
    steps: int = 100_000
    eval_freq: int = 20_000
    log_freq: int = 200
@@ -130,9 +132,15 @@ class TrainPipelineConfig(HubMixin):
            now = dt.datetime.now()
            train_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
            self.output_dir = Path("outputs/train") / train_dir
+        if self.profile_mode != "off" and self.profile_output_dir is None:
+            self.profile_output_dir = self.output_dir / "profiling"

        if isinstance(self.dataset.repo_id, list):
            raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")
+        if self.profile_mode not in {"off", "summary", "trace"}:
+            raise ValueError(
+                f"`profile_mode` must be one of 'off', 'summary', or 'trace', got {self.profile_mode}."
+            )

        if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
            raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
@@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig):
    camera_name_mapping: dict[str, str] | None = None
    observation_height: int = 360
    observation_width: int = 360
+    is_libero_plus: bool = False
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {
            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
@@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig):
            control_mode=self.control_mode,
            episode_length=self.episode_length,
            camera_name_mapping=self.camera_name_mapping,
+            is_libero_plus=self.is_libero_plus,
        )

    def get_env_processors(self):
@@ -496,6 +498,146 @@ class MetaworldEnv(EnvConfig):
        )


+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+    task: str = "CloseFridge"
+    fps: int = 20
+    episode_length: int = 1000
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+    observation_height: int = 256
+    observation_width: int = 256
+    visualization_height: int = 512
+    visualization_width: int = 512
+    split: str | None = None
+    # Object-mesh registries to sample from. Upstream default is
+    # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
+    # only ships the lightwheel pack. Override to include objaverse once
+    # you've run `python -m robocasa.scripts.download_kitchen_assets
+    # --type objaverse` locally.
+    obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
+    )
+    features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
+
+    def __post_init__(self):
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # Preserve raw RoboCasa camera names end-to-end (e.g.
+        # `observation.images.robot0_agentview_left`). This matches the
+        # naming convention used by the RoboCasa datasets on the Hub, so
+        # trained policies don't need a `--rename_map` at eval time.
+        cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
+        for cam in cams:
+            self.features[f"pixels/{cam}"] = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(self.observation_height, self.observation_width, 3),
+            )
+            self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}"
+
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        kwargs: dict[str, Any] = {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "observation_height": self.observation_height,
+            "observation_width": self.observation_width,
+            "visualization_height": self.visualization_height,
+            "visualization_width": self.visualization_width,
+        }
+        if self.split is not None:
+            kwargs["split"] = self.split
+        return kwargs
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = False):
+        from .robocasa import create_robocasa_envs
+
+        if self.task is None:
+            raise ValueError("RoboCasaEnv requires a task to be specified")
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_robocasa_envs(
+            task=self.task,
+            n_envs=n_envs,
+            camera_name=self.camera_name,
+            gym_kwargs=self.gym_kwargs,
+            env_cls=env_cls,
+            episode_length=self.episode_length,
+            obj_registries=tuple(self.obj_registries),
+        )
+
+
+@EnvConfig.register_subclass("vlabench")
+@dataclass
+class VLABenchEnv(EnvConfig):
+    task: str = "select_fruit"
+    fps: int = 10
+    episode_length: int = 500
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    render_resolution: tuple[int, int] = (480, 480)
+    robot: str = "franka"
+    action_mode: str = "eef"
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "agent_pos": OBS_STATE,
+            "pixels/image": f"{OBS_IMAGES}.image",
+            "pixels/second_image": f"{OBS_IMAGES}.second_image",
+            "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
+        }
+    )
+
+    def __post_init__(self):
+        h, w = self.render_resolution
+        if self.obs_type == "pixels":
+            self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+            self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+            self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+        elif self.obs_type == "pixels_agent_pos":
+            self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+            self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+            self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,))
+        else:
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+    @property
+    def gym_kwargs(self) -> dict:
+        return {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "render_resolution": self.render_resolution,
+            "robot": self.robot,
+            "max_episode_steps": self.episode_length,
+            "action_mode": self.action_mode,
+        }
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = False):
+        from .vlabench import create_vlabench_envs
+
+        if self.task is None:
+            raise ValueError("VLABenchEnv requires a task to be specified")
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_vlabench_envs(
+            task=self.task,
+            n_envs=n_envs,
+            gym_kwargs=self.gym_kwargs,
+            env_cls=env_cls,
+        )
+
+
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
 class IsaaclabArenaEnv(HubEnvConfig):
@@ -574,3 +716,171 @@ class IsaaclabArenaEnv(HubEnvConfig):
            ),
            PolicyProcessorPipeline(steps=[]),
        )
+
+
+@EnvConfig.register_subclass("libero_plus")
+@dataclass
+class LiberoPlusEnv(LiberoEnv):
+    """Config for LIBERO-plus robustness benchmark evaluation.
+
+    LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints,
+    object layouts, robot initial states, language instructions, lighting, background
+    textures, sensor noise) producing ~10k task variants.
+
+    The gym interface is identical to LIBERO so this class reuses ``LiberoEnv``
+    entirely. It is registered under the name ``libero_plus`` and declares two
+    fields: the default ``task`` suite and the ``is_libero_plus`` flag, which
+    defaults to ``True``.
+
+    Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships
+    as a namespace package from a git fork and must be cloned + PYTHONPATH'd
+    rather than installed as a pyproject extra.
+
+    See Also:
+        https://github.com/sylvestf/LIBERO-plus
+    """
+
+    # Default suite evaluated when no task is given on the command line.
+    task: str = "libero_spatial"
+    # Marks this config as LIBERO-plus. NOTE(review): presumably forwarded by
+    # the parent's env factory to select LIBERO-plus init-state resolution;
+    # confirm in LiberoEnv.create_envs.
+    is_libero_plus: bool = True
+
+
+@EnvConfig.register_subclass("robotwin")
+@dataclass
+class RoboTwinEnvConfig(EnvConfig):
+    """Configuration for RoboTwin 2.0 benchmark environments.
+
+    RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the
+    SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
+    (7 per arm). All three cameras are enabled by default.
+
+    See: https://robotwin-platform.github.io
+    Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
+    """
+
+    task: str = "beat_block_hammer"  # single task or comma-separated list
+    fps: int = 25
+    episode_length: int = 300
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    # Available cameras from RoboTwin's aloha-agilex embodiment: head_camera
+    # (torso-mounted) + left_camera / right_camera (wrists).
+    camera_names: str = "head_camera,left_camera,right_camera"
+    # Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
+    # Gym's vector-env concatenate pre-allocates buffers of this shape, so it
+    # must equal what SAPIEN actually renders.
+    observation_height: int = 240
+    observation_width: int = 320
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "pixels/head_camera": f"{OBS_IMAGES}.head_camera",
+            "pixels/left_camera": f"{OBS_IMAGES}.left_camera",
+            "pixels/right_camera": f"{OBS_IMAGES}.right_camera",
+            "agent_pos": OBS_STATE,
+        }
+    )
+
+    def _camera_list(self) -> list[str]:
+        """Split ``camera_names`` on commas, dropping blank entries."""
+        stripped = (name.strip() for name in self.camera_names.split(","))
+        return [name for name in stripped if name]
+
+    def __post_init__(self):
+        """Register one visual feature per camera, then the state feature.
+
+        Raises:
+            ValueError: If ``obs_type`` is neither ``pixels`` nor ``pixels_agent_pos``.
+        """
+        image_shape = (self.observation_height, self.observation_width, 3)
+        for cam in self._camera_list():
+            feature_key = f"pixels/{cam}"
+            self.features[feature_key] = PolicyFeature(type=FeatureType.VISUAL, shape=image_shape)
+            # Entries already present (e.g. from the default factory) win.
+            self.features_map.setdefault(feature_key, f"{OBS_IMAGES}.{cam}")
+
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(
+                f"Unsupported obs_type '{self.obs_type}'. "
+                "RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
+            )
+        if self.obs_type == "pixels_agent_pos":
+            # 14 DOF: 7 per arm
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(14,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """No extra gym kwargs: ``create_envs`` passes every setting explicitly."""
+        return {}
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = True):
+        """Build the vectorized RoboTwin environments for ``task``.
+
+        Raises:
+            ValueError: If ``task`` is empty.
+        """
+        from lerobot.envs.robotwin import create_robotwin_envs
+
+        if not self.task:
+            raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
+
+        vec_env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_robotwin_envs(
+            task=self.task,
+            n_envs=n_envs,
+            env_cls=vec_env_cls,
+            camera_names=self._camera_list(),
+            observation_height=self.observation_height,
+            observation_width=self.observation_width,
+            episode_length=self.episode_length,
+        )
+
+
+@EnvConfig.register_subclass("robomme")
+@dataclass
+class RoboMMEEnv(EnvConfig):
+    """RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).
+
+    16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
+    Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes).
+    Benchmark: https://github.com/RoboMME/robomme_benchmark
+
+    Requires the `robomme` git package installed separately (Linux only);
+    see docker/Dockerfile.benchmark.robomme for the canonical install.
+    """
+
+    task: str = "PickXtimes"
+    fps: int = 10
+    episode_length: int = 300
+    action_space: str = "joint_angle"  # or "ee_pose" (7-D)
+    dataset_split: str = "test"  # "train" | "val" | "test"
+    task_ids: list[int] | None = None
+    features: dict[str, PolicyFeature] = field(default_factory=dict)
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            ACTION: ACTION,
+            "pixels/image": f"{OBS_IMAGES}.image",
+            "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
+            "agent_pos": OBS_STATE,
+        }
+    )
+
+    def __post_init__(self):
+        """Rebuild ``features`` from scratch; any value passed in is replaced."""
+        # NOTE(review): every value other than "joint_angle" is treated as a
+        # 7-D action space, so a misspelled name is not rejected here.
+        action_dim = 7
+        if self.action_space == "joint_angle":
+            action_dim = 8
+
+        rebuilt = {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,))}
+        for image_key in ("pixels/image", "pixels/wrist_image"):
+            rebuilt[image_key] = PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3))
+        rebuilt["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(8,))
+        self.features = rebuilt
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """No extra gym kwargs: ``create_envs`` passes every setting explicitly."""
+        return {}
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = True):
+        """Build the vectorized RoboMME environments for ``task``."""
+        from lerobot.envs.robomme import create_robomme_envs
+
+        vec_env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+        return create_robomme_envs(
+            task=self.task,
+            n_envs=n_envs,
+            action_space_type=self.action_space,
+            dataset=self.dataset_split,
+            episode_length=self.episode_length,
+            task_ids=self.task_ids,
+            env_cls=vec_env_cls,
+        )
@@ -16,6 +16,7 @@
 from __future__ import annotations

 import os
+import re
 from collections import defaultdict
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
@@ -31,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv

 from lerobot.types import RobotObservation

-from .utils import _LazyAsyncVectorEnv
-
-
-def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
-    """Normalize camera_name into a non-empty list of strings."""
-    if isinstance(camera_name, str):
-        cams = [c.strip() for c in camera_name.split(",") if c.strip()]
-    elif isinstance(camera_name, (list | tuple)):
-        cams = [str(c).strip() for c in camera_name if str(c).strip()]
-    else:
-        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
-    if not cams:
-        raise ValueError("camera_name resolved to an empty list.")
-    return cams
+from .utils import _LazyAsyncVectorEnv, parse_camera_names


 def _get_suite(name: str) -> benchmark.Benchmark:
@@ -69,14 +57,34 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
    return ids


-def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
-    init_states_path = (
-        Path(get_libero_path("init_states"))
-        / task_suite.tasks[i].problem_folder
-        / task_suite.tasks[i].init_states_file
-    )
-    init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
-    return init_states
+# LIBERO-plus perturbation variants encode the perturbation in the filename
+# but on disk only the base `.pruned_init` exists — strip the suffix to match
+# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
+# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
+_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
+
+
+def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray:
+    """Load the stored initial states for task ``i`` of ``task_suite``.
+
+    Files are resolved under ``get_libero_path("init_states")``:
+
+    - plain LIBERO: ``<root>/<problem_folder>/<init_states_file name>``;
+    - LIBERO-plus files whose name contains ``_add_`` or ``_level``:
+      ``<root>/libero_newobj/<problem_folder>/<name>``, with the loaded object
+      reshaped to ``(1, -1)``;
+    - any other LIBERO-plus file: the perturbation suffix matched by
+      ``_LIBERO_PERTURBATION_SUFFIX_RE`` is removed from the stem first.
+
+    Args:
+        task_suite: Object exposing ``tasks[i].init_states_file`` and
+            ``tasks[i].problem_folder``.
+        i: Index of the task within ``task_suite.tasks``.
+        is_libero_plus: Enables the LIBERO-plus path resolution described above.
+
+    Returns:
+        The object produced by ``torch.load`` for the resolved file.
+
+    Note:
+        ``weights_only=False`` lets ``torch.load`` unpickle arbitrary objects,
+        so this must only be pointed at trusted init-state files.
+    """
+    task = task_suite.tasks[i]
+    filename = Path(task.init_states_file)
+    root = Path(get_libero_path("init_states"))
+
+    if not is_libero_plus:
+        # Only the final path component of init_states_file is used.
+        init_states_path = root / task.problem_folder / filename.name
+        return torch.load(init_states_path, weights_only=False)  # nosec B614
+
+    # LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under
+    # libero_newobj/ as a flat array that must be reshaped to (1, -1).
+    if "_add_" in filename.name or "_level" in filename.name:
+        init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
+        init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
+        return init_states.reshape(1, -1)
+
+    # LIBERO-plus perturbation variants encode the perturbation in the filename
+    # but on disk only the base `.pruned_init` exists — strip the suffix to match.
+    stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
+    init_states_path = root / task.problem_folder / stripped
+    return torch.load(init_states_path, weights_only=False)  # nosec B614


 def get_libero_dummy_action():
@@ -118,9 +126,11 @@ class LiberoEnv(gym.Env):
        camera_name_mapping: dict[str, str] | None = None,
        num_steps_wait: int = 10,
        control_mode: str = "relative",
+        is_libero_plus: bool = False,
    ):
        super().__init__()
        self.task_id = task_id
+        self.is_libero_plus = is_libero_plus
        self.obs_type = obs_type
        self.render_mode = render_mode
        self.observation_width = observation_width
@@ -128,7 +138,7 @@ class LiberoEnv(gym.Env):
        self.visualization_width = visualization_width
        self.visualization_height = visualization_height
        self.init_states = init_states
-        self.camera_name = _parse_camera_names(
+        self.camera_name = parse_camera_names(
            camera_name
        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)

@@ -147,7 +157,11 @@ class LiberoEnv(gym.Env):
        self.episode_index = episode_index
        self.episode_length = episode_length
        # Load once and keep
-        self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
+        self._init_states = (
+            get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus)
+            if self.init_states
+            else None
+        )
        self._reset_stride = n_envs  # when performing a reset, append `_reset_stride` to `init_state_id`.

        self.init_state_id = self.episode_index  # tie each sub-env to a fixed init state
@@ -380,6 +394,7 @@ def _make_env_fns(
    gym_kwargs: Mapping[str, Any],
    control_mode: str,
    camera_name_mapping: dict[str, str] | None = None,
+    is_libero_plus: bool = False,
 ) -> list[Callable[[], LiberoEnv]]:
    """Build n_envs factory callables for a single (suite, task_id)."""

@@ -396,6 +411,7 @@ def _make_env_fns(
            n_envs=n_envs,
            control_mode=control_mode,
            camera_name_mapping=camera_name_mapping,
+            is_libero_plus=is_libero_plus,
            **local_kwargs,
        )

@@ -418,6 +434,7 @@ def create_libero_envs(
    control_mode: str = "relative",
    episode_length: int | None = None,
    camera_name_mapping: dict[str, str] | None = None,
+    is_libero_plus: bool = False,
 ) -> dict[str, dict[int, Any]]:
    """
    Create vectorized LIBERO environments with a consistent return shape.
@@ -437,7 +454,7 @@ def create_libero_envs(
    gym_kwargs = dict(gym_kwargs or {})
    task_ids_filter = gym_kwargs.pop("task_ids", None)  # optional: limit to specific tasks

-    camera_names = _parse_camera_names(camera_name)
+    camera_names = parse_camera_names(camera_name)
    suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
    if not suite_names:
        raise ValueError("`task` must contain at least one LIBERO suite name.")
@@ -462,6 +479,7 @@ def create_libero_envs(
        # Probe once and reuse to avoid creating a temp env per task.
        cached_obs_space: spaces.Space | None = None
        cached_act_space: spaces.Space | None = None
+        cached_metadata: dict[str, Any] | None = None

        for tid in selected:
            fns = _make_env_fns(
@@ -475,12 +493,14 @@ def create_libero_envs(
                gym_kwargs=gym_kwargs,
                control_mode=control_mode,
                camera_name_mapping=camera_name_mapping,
+                is_libero_plus=is_libero_plus,
            )
            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[suite_name][tid] = lazy
            else:
                out[suite_name][tid] = env_cls(fns)
@@ -311,6 +311,7 @@ def create_metaworld_envs(
    is_async = env_cls is gym.vector.AsyncVectorEnv
    cached_obs_space = None
    cached_act_space = None
+    cached_metadata = None
    out: dict[str, dict[int, Any]] = defaultdict(dict)

    for group in task_groups:
@@ -324,10 +325,11 @@ def create_metaworld_envs(
            fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]

            if is_async:
-                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
                if cached_obs_space is None:
                    cached_obs_space = lazy.observation_space
                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
                out[group][tid] = lazy
            else:
                out[group][tid] = env_cls(fns)
@@ -0,0 +1,425 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
+
+logger = logging.getLogger(__name__)
+
+# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
+# These correspond to the PandaOmron robot in RoboCasa365.
+OBS_STATE_DIM = 16  # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
+ACTION_DIM = 12  # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+
+# Default PandaOmron cameras. We surface these raw names directly as
+# `observation.images.<name>` so the LeRobot dataset/policy keys match
+# RoboCasa's native convention (no implicit renaming).
+DEFAULT_CAMERAS = [
+    "robot0_agentview_left",
+    "robot0_eye_in_hand",
+    "robot0_agentview_right",
+]
+
+# Object-mesh registries to sample from. RoboCasa's upstream default is
+# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
+# most users — including our CI image — only download the lightwheel pack
+# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
+# category has zero candidates in every registry, robocasa crashes with
+# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
+# normalization). Restricting to registries that are actually on disk
+# avoids the NaN and matches what the asset download provides.
+DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
+
+# Task-group shortcuts accepted as `--env.task`. When the user passes one of
+# these names, we expand it to the upstream RoboCasa task list and auto-set
+# the dataset split. Individual task names (optionally comma-separated) still
+# take precedence; this only triggers on an exact group-name match.
+_TASK_GROUP_SPLITS = {
+    "atomic_seen": "target",
+    "composite_seen": "target",
+    "composite_unseen": "target",
+    "pretrain50": "pretrain",
+    "pretrain100": "pretrain",
+    "pretrain200": "pretrain",
+    "pretrain300": "pretrain",
+}
+
+
+def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
+    """Turn a `--env.task` value into ``(task_names, split_override)``.
+
+    An exact match on a key of `_TASK_GROUP_SPLITS` (e.g. `atomic_seen`,
+    `pretrain100`) is expanded through
+    `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS` and paired
+    with that group's split. Anything else is read as one task name or a
+    comma-separated list, and the split override is `None`.
+
+    Raises:
+        ValueError: If the group is missing from the installed robocasa
+            registry, or if no task name remains after splitting.
+    """
+    key = task.strip()
+
+    # Plain task names: no robocasa import is needed on this path.
+    if key not in _TASK_GROUP_SPLITS:
+        names = []
+        for piece in task.split(","):
+            piece = piece.strip()
+            if piece:
+                names.append(piece)
+        if not names:
+            raise ValueError("`task` must contain at least one RoboCasa task name.")
+        return names, None
+
+    from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
+
+    combined = {**TARGET_TASKS, **PRETRAINING_TASKS}
+    if key in combined:
+        return list(combined[key]), _TASK_GROUP_SPLITS[key]
+    raise ValueError(
+        f"Task group '{key}' is not available in this version of robocasa. "
+        f"Known groups: {sorted(combined.keys())}."
+    )
+
+
+def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
+    """Slice a flat (12,) action vector into the RoboCasa action dict.
+
+    Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+
+    Each value is a plain slice of ``flat_action``; nothing is copied or validated.
+    """
+    # (key, start, stop) in the order the keys appear in the returned dict.
+    layout = (
+        ("action.base_motion", 0, 4),
+        ("action.control_mode", 4, 5),
+        ("action.end_effector_position", 5, 8),
+        ("action.end_effector_rotation", 8, 11),
+        ("action.gripper_close", 11, 12),
+    )
+    return {name: flat_action[start:stop] for name, start, stop in layout}
+
+
+class RoboCasaEnv(gym.Env):
+    """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
+
+    Wraps RoboCasaGymEnv from the robocasa package and converts its
+    dict-based observations and actions into the flat arrays LeRobot expects.
+    Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
+
+    The underlying simulator is created lazily by ``_ensure_env`` on the first
+    ``reset``/``step``/``render`` call; the constructor only declares the
+    observation and action spaces.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
+
+    def __init__(
+        self,
+        task: str,
+        camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+        obs_type: str = "pixels_agent_pos",
+        render_mode: str = "rgb_array",
+        observation_width: int = 256,
+        observation_height: int = 256,
+        visualization_width: int = 512,
+        visualization_height: int = 512,
+        split: str | None = None,
+        episode_length: int | None = None,
+        obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+        episode_index: int = 0,
+    ):
+        """Store the configuration and declare the gym spaces.
+
+        Args:
+            task: RoboCasa environment name, passed as ``env_name``.
+            camera_name: Comma-separated string or sequence of camera names.
+            obs_type: ``pixels`` or ``pixels_agent_pos``.
+            render_mode: Stored on the instance; not read again in this class.
+            observation_width: Camera frame width, also used for the image spaces.
+            observation_height: Camera frame height, also used for the image spaces.
+            visualization_width: Stored on the instance; not read again in this class.
+            visualization_height: Stored on the instance; not read again in this class.
+            split: Dataset split; ``None`` is replaced by ``"all"`` when the
+                simulator is created.
+            episode_length: Value for ``_max_episode_steps`` (1000 when ``None``).
+            obj_registries: Object-mesh registries forwarded to the simulator.
+            episode_index: Per-worker offset added to the reset seed.
+
+        Raises:
+            ValueError: If ``obs_type`` is not supported.
+        """
+        super().__init__()
+        self.task = task
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.observation_width = observation_width
+        self.observation_height = observation_height
+        self.visualization_width = visualization_width
+        self.visualization_height = visualization_height
+        self.split = split
+        self.obj_registries = tuple(obj_registries)
+        # Per-worker index (0..n_envs-1) used to spread the user-provided
+        # seed across factories so each sub-env explores a distinct layout
+        # even when the same seed is passed to `reset()`.
+        self.episode_index = int(episode_index)
+
+        self.camera_name = parse_camera_names(camera_name)
+
+        # Stored only; nothing in this class reads it back. Presumably the
+        # evaluation loop queries it on the vector env (TODO confirm).
+        self._max_episode_steps = episode_length if episode_length is not None else 1000
+
+        # Deferred — created on first reset() inside the worker subprocess
+        # to avoid inheriting stale GPU/EGL contexts across fork().
+        self._env: Any = None
+        self.task_description = ""
+
+        images = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_name
+        }
+
+        if self.obs_type == "pixels":
+            self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-np.inf,
+                        high=np.inf,
+                        shape=(OBS_STATE_DIM,),
+                        dtype=np.float32,
+                    ),
+                }
+            )
+        else:
+            raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
+
+        self.action_space = spaces.Box(
+            low=ACTION_LOW,
+            high=ACTION_HIGH,
+            shape=(ACTION_DIM,),
+            dtype=np.float32,
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the underlying RoboCasaGymEnv on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+
+        Because ``close()`` clears ``self._env``, calling this after a close
+        builds a fresh simulator.
+        """
+        if self._env is not None:
+            return
+        from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
+
+        # RoboCasaGymEnv defaults split="test", which create_env rejects
+        # (only None/"all"/"pretrain"/"target" are valid). Always pass a
+        # valid value so we don't hit that default. Extra kwargs are
+        # forwarded to the underlying kitchen env via create_env/robosuite.make.
+        self._env = RoboCasaGymEnv(
+            env_name=self.task,
+            camera_widths=self.observation_width,
+            camera_heights=self.observation_height,
+            split=self.split if self.split is not None else "all",
+            obj_registries=self.obj_registries,
+        )
+
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+    def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
+        """Convert RoboCasaGymEnv observation dict to LeRobot format.
+
+        Returns ``{"pixels": {...}}`` for ``obs_type == "pixels"``, otherwise
+        the same dict plus a float32 ``agent_pos`` vector.
+        """
+        # RoboCasaGymEnv emits camera frames under "video.<cam>".
+        # NOTE(review): a configured camera missing from raw_obs is silently
+        # left out, so the result may not match observation_space.
+        images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+
+        # `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
+        # Missing keys fall back to zeros of the expected length.
+        agent_pos = np.concatenate(
+            [
+                raw_obs.get("state.base_position", np.zeros(3)),
+                raw_obs.get("state.base_rotation", np.zeros(4)),
+                raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
+                raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
+                raw_obs.get("state.gripper_qpos", np.zeros(2)),
+            ],
+            axis=-1,
+        ).astype(np.float32)
+
+        return {"pixels": images, "agent_pos": agent_pos}
+
+    def render(self) -> np.ndarray:
+        """Return the underlying environment's rendered frame."""
+        self._ensure_env()
+        assert self._env is not None
+        return self._env.render()
+
+    def reset(self, seed=None, **kwargs):
+        """Reset the simulator and return ``(observation, info)``.
+
+        Args:
+            seed: Optional base seed; ``episode_index`` is added to it.
+            **kwargs: Accepted for gym API compatibility and ignored.
+
+        Returns:
+            The formatted observation and ``{"is_success": False}``.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        super().reset(seed=seed)
+        # Spread the seed across workers so n_envs factories don't all
+        # roll the same scene. With an explicit user seed we shift it by
+        # episode_index; with no seed we fall back to episode_index so
+        # each worker is still distinct rather than inheriting the same
+        # global RNG state.
+        worker_seed = seed + self.episode_index if seed is not None else self.episode_index
+        # The simulator's own info dict is not used; a fixed one is returned.
+        raw_obs, _ = self._env.reset(seed=worker_seed)
+
+        ep_meta = self._env.env.get_ep_meta()
+        self.task_description = ep_meta.get("lang", self.task)
+
+        observation = self._format_raw_obs(raw_obs)
+        return observation, {"is_success": False}
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one flat action and return the gym 5-tuple.
+
+        On termination (simulator ``done`` or reported success) the info dict
+        gains a ``final_info`` entry and the environment resets itself; the
+        returned observation is the one from before that reset.
+
+        Raises:
+            ValueError: If ``action`` is not 1-D.
+        """
+        self._ensure_env()
+        assert self._env is not None
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        action_dict = convert_action(action)
+        raw_obs, reward, done, truncated, info = self._env.step(action_dict)
+
+        is_success = bool(info.get("success", False))
+        terminated = done or is_success
+        info.update({"task": self.task, "done": done, "is_success": is_success})
+
+        observation = self._format_raw_obs(raw_obs)
+        if terminated:
+            info["final_info"] = {
+                "task": self.task,
+                "done": bool(done),
+                "is_success": bool(is_success),
+            }
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+    def close(self):
+        """Close the simulator if one exists; safe to call more than once.
+
+        The reference is cleared before closing, so a repeated ``close()`` is
+        a no-op and a later ``reset()``/``render()`` rebuilds the simulator
+        instead of reusing a closed one.
+        """
+        env, self._env = self._env, None
+        if env is not None:
+            env.close()
+
+
+def _make_env_fns(
+    *,
+    task: str,
+    n_envs: int,
+    camera_names: list[str],
+    obs_type: str,
+    render_mode: str,
+    observation_width: int,
+    observation_height: int,
+    visualization_width: int,
+    visualization_height: int,
+    split: str | None,
+    episode_length: int | None,
+    obj_registries: Sequence[str],
+) -> list[Callable[[], RoboCasaEnv]]:
+    """Return ``n_envs`` zero-argument factories for one task.
+
+    Factory ``i`` builds a ``RoboCasaEnv`` with ``episode_index=i``
+    (``0..n_envs-1``); every other argument is identical across factories, so
+    ``RoboCasaEnv.reset()`` can offset the user-provided seed per worker.
+    """
+    # Settings common to every worker; only episode_index varies.
+    shared_kwargs = {
+        "task": task,
+        "camera_name": camera_names,
+        "obs_type": obs_type,
+        "render_mode": render_mode,
+        "observation_width": observation_width,
+        "observation_height": observation_height,
+        "visualization_width": visualization_width,
+        "visualization_height": visualization_height,
+        "split": split,
+        "episode_length": episode_length,
+        "obj_registries": obj_registries,
+    }
+
+    def _build(worker_index: int) -> RoboCasaEnv:
+        return RoboCasaEnv(episode_index=worker_index, **shared_kwargs)
+
+    factories = []
+    for worker_index in range(n_envs):
+        factories.append(partial(_build, worker_index))
+    return factories
+
+
+def create_robocasa_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] | None = None,
+    camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    episode_length: int | None = None,
+    obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+) -> dict[str, dict[int, Any]]:
+    """Create vectorized RoboCasa365 environments with a consistent return shape.
+
+    Returns:
+        dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
+        Each task contributes exactly one entry, stored under task_id 0.
+
+    `task` can be:
+      - a single task name (e.g. `CloseFridge`)
+      - a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`)
+      - a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
+        `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
+        `pretrain300`), which auto-expands to the upstream task list and
+        auto-sets the dataset `split` ("target" or "pretrain").
+
+    Raises:
+        ValueError: If `env_cls` is missing or not callable, if `n_envs` is
+            not a positive int, or if `task` resolves to no task name.
+    """
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    # Work on a copy so the caller's dict is not mutated by the pops below.
+    # NOTE(review): keys left in gym_kwargs after these pops are never
+    # forwarded to the environments.
+    gym_kwargs = dict(gym_kwargs or {})
+    obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos")
+    render_mode = gym_kwargs.pop("render_mode", "rgb_array")
+    observation_width = gym_kwargs.pop("observation_width", 256)
+    observation_height = gym_kwargs.pop("observation_height", 256)
+    visualization_width = gym_kwargs.pop("visualization_width", 512)
+    visualization_height = gym_kwargs.pop("visualization_height", 512)
+    split = gym_kwargs.pop("split", None)
+
+    camera_names = parse_camera_names(camera_name)
+    task_names, group_split = _resolve_tasks(str(task))
+    # An explicit split in gym_kwargs wins over the one implied by a task group.
+    if group_split is not None and split is None:
+        split = group_split
+
+    logger.info(
+        "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
+        task_names,
+        split,
+        n_envs,
+    )
+
+    # Identity check: only the exact AsyncVectorEnv class takes the lazy path.
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+
+    # Spaces and metadata read from the first lazy env are handed to every
+    # later one.
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
+    out: dict[str, dict[int, Any]] = defaultdict(dict)
+
+    for task_name in task_names:
+        fns = _make_env_fns(
+            task=task_name,
+            n_envs=n_envs,
+            camera_names=camera_names,
+            obs_type=obs_type,
+            render_mode=render_mode,
+            observation_width=observation_width,
+            observation_height=observation_height,
+            visualization_width=visualization_width,
+            visualization_height=visualization_height,
+            split=split,
+            episode_length=episode_length,
+            obj_registries=obj_registries,
+        )
+
+        if is_async:
+            lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = lazy.observation_space
+                cached_act_space = lazy.action_space
+                cached_metadata = lazy.metadata
+            out[task_name][0] = lazy
+        else:
+            out[task_name][0] = env_cls(fns)
+        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
+
+    # Convert the defaultdict (and its inner maps) to plain dicts for callers.
+    return {name: dict(task_map) for name, task_map in out.items()}
@@ -0,0 +1,245 @@
+"""RoboMME environment wrapper for LeRobot evaluation.
+
+Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
+``VectorEnv`` suitable for ``lerobot_eval``.
+
+RoboMME tasks:
+  Counting:    BinFill, PickXtimes, SwingXtimes, StopCube
+  Permanence:  VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
+  Reference:   PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
+  Imitation:   MoveCube, InsertPeg, PatternLock, RouteStick
+
+Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes)
+Install: see docker/Dockerfile.benchmark.robomme  (Linux only — mani-skill vs numpy pin conflict)
+Benchmark: https://github.com/RoboMME/robomme_benchmark
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from .utils import _LazyAsyncVectorEnv
+
+# Names of the 16 RoboMME tasks, in the same order as the suites listed in the
+# module docstring: Counting, Permanence, Reference, Imitation (4 tasks each).
+ROBOMME_TASKS = [
+    "BinFill",
+    "PickXtimes",
+    "SwingXtimes",
+    "StopCube",
+    "VideoUnmask",
+    "VideoUnmaskSwap",
+    "ButtonUnmask",
+    "ButtonUnmaskSwap",
+    "PickHighlight",
+    "VideoRepick",
+    "VideoPlaceButton",
+    "VideoPlaceOrder",
+    "MoveCube",
+    "InsertPeg",
+    "PatternLock",
+    "RouteStick",
+]
+
+
+class RoboMMEGymEnv(gym.Env):
+    """Thin Gymnasium wrapper around a single RoboMME episode env.
+
+    A ``BenchmarkEnvBuilder`` is created at construction time; the episode env
+    itself is (re)built from it on every ``reset()``, so ``reset()`` must be
+    called before ``step()``.
+
+    Observations are ``{"pixels": {"image", "wrist_image"}, "agent_pos"}``,
+    where ``agent_pos`` concatenates the first 7 joint values with the first
+    gripper value. The declared spaces use 256x256x3 uint8 frames.
+
+    Args:
+        task: RoboMME task name, passed to the builder as ``env_id``.
+        action_space_type: Passed to the builder as ``action_space``;
+            ``"joint_angle"`` yields an 8-dim action space, anything else 7-dim.
+        dataset: Passed to the builder as ``dataset``.
+        episode_idx: Episode index handed to the builder on every ``reset()``.
+        max_steps: Step limit passed to the builder; also exposed as
+            ``_max_episode_steps``.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
+
+    def __init__(
+        self,
+        task: str = "PickXtimes",
+        action_space_type: str = "joint_angle",
+        dataset: str = "test",
+        episode_idx: int = 0,
+        max_steps: int = 300,
+    ):
+        super().__init__()
+        # Imported lazily so importing this module does not require RoboMME.
+        from robomme.env_record_wrapper import BenchmarkEnvBuilder
+
+        self._task = task
+        self._action_space_type = action_space_type
+        self._dataset = dataset
+        self._episode_idx = episode_idx
+        self._max_steps = max_steps
+        self._max_episode_steps = max_steps
+
+        self._builder = BenchmarkEnvBuilder(
+            env_id=task,
+            dataset=dataset,
+            action_space=action_space_type,
+            gui_render=False,
+            max_steps=max_steps,
+        )
+        self._env = None
+        self._last_raw_obs: dict | None = None
+
+        action_dim = 8 if action_space_type == "joint_angle" else 7
+        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
+        # `pixels` must be a nested Dict so `preprocess_observation()` in
+        # envs/utils.py picks it up and maps each camera to
+        # `observation.images.<cam>`. A flat layout (`pixels/image`,
+        # `pixels/wrist_image`) silently drops every image from the batch.
+        self.observation_space = spaces.Dict(
+            {
+                "pixels": spaces.Dict(
+                    {
+                        "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                        "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                    }
+                ),
+                "agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
+            }
+        )
+
+    @staticmethod
+    def _latest(value):
+        """Return the last element when ``value`` is a list, otherwise ``value`` itself."""
+        return value[-1] if isinstance(value, list) else value
+
+    def reset(self, *, seed=None, options=None):
+        """Rebuild the episode env for ``episode_idx`` and return ``(obs, info)``."""
+        super().reset(seed=seed)
+        # NOTE(review): the previously built episode env (if any) is dropped
+        # without being closed here — confirm whether RoboMME needs an explicit
+        # close to release simulator resources.
+        self._env = self._builder.make_env_for_episode(
+            episode_idx=self._episode_idx,
+            max_steps=self._max_steps,
+        )
+        obs, info = self._env.reset()
+        self._last_raw_obs = obs
+        return self._convert_obs(obs), self._convert_info(info)
+
+    def step(self, action):
+        """Advance the episode env by one action.
+
+        Returns the Gymnasium 5-tuple; ``info["is_success"]`` is true when the
+        raw ``info["status"]`` equals ``"success"``.
+        """
+        # The episode env only exists after reset(); fail with a clear message
+        # rather than an AttributeError on None.
+        assert self._env is not None, "step() called before reset()"
+        obs, reward, terminated, truncated, info = self._env.step(action)
+        self._last_raw_obs = obs
+
+        # Flags may arrive as array-like scalars exposing `.item()`; normalize to bool.
+        terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
+        truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
+
+        status = info.get("status", "ongoing")
+        is_success = status == "success"
+        conv_info = self._convert_info(info)
+        conv_info["is_success"] = is_success
+
+        return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
+
+    def render(self) -> np.ndarray | None:
+        """Return the front camera image from the last observation for video recording."""
+        if self._last_raw_obs is None:
+            return np.zeros((256, 256, 3), dtype=np.uint8)
+        front = self._last_raw_obs.get("front_rgb_list")
+        if front is None:
+            return np.zeros((256, 256, 3), dtype=np.uint8)
+        return np.asarray(self._latest(front), dtype=np.uint8)
+
+    def _convert_obs(self, obs: dict) -> dict:
+        """Map a raw RoboMME observation to the ``pixels`` / ``agent_pos`` layout."""
+        front_rgb = np.asarray(self._latest(obs["front_rgb_list"]), dtype=np.uint8)
+        wrist_rgb = np.asarray(self._latest(obs["wrist_rgb_list"]), dtype=np.uint8)
+        # State vector: first 7 joint values followed by the first gripper value.
+        joint = np.asarray(self._latest(obs["joint_state_list"]), dtype=np.float32).flatten()[:7]
+        gripper = np.asarray(self._latest(obs["gripper_state_list"]), dtype=np.float32).flatten()[:1]
+        state = np.concatenate([joint, gripper])
+
+        return {
+            "pixels": {"image": front_rgb, "wrist_image": wrist_rgb},
+            "agent_pos": state,
+        }
+
+    def _convert_info(self, info: dict) -> dict:
+        """Keep only ``status`` and ``task_goal`` from the raw info, with defaults."""
+        return {
+            "status": info.get("status", "ongoing"),
+            "task_goal": info.get("task_goal", ""),
+        }
+
+
+def _make_env_fns(
+    *,
+    task: str,
+    n_envs: int,
+    action_space_type: str,
+    dataset: str,
+    episode_length: int,
+    task_id: int,
+) -> list[Callable[[], RoboMMEGymEnv]]:
+    """Return ``n_envs`` zero-argument env factories for one RoboMME task.
+
+    Factory ``i`` builds a ``RoboMMEGymEnv`` whose ``episode_idx`` is
+    ``task_id + i``; every other setting is shared by all factories.
+    """
+
+    def _build(episode_idx: int) -> RoboMMEGymEnv:
+        env_kwargs = {
+            "task": task,
+            "action_space_type": action_space_type,
+            "dataset": dataset,
+            "episode_idx": episode_idx,
+            "max_steps": episode_length,
+        }
+        return RoboMMEGymEnv(**env_kwargs)
+
+    factories: list[Callable[[], RoboMMEGymEnv]] = []
+    for offset in range(n_envs):
+        # Bind the episode index now; the env itself is only built when called.
+        factories.append(partial(_build, task_id + offset))
+    return factories
+
+
+def create_robomme_envs(
+    task: str,
+    n_envs: int = 1,
+    action_space_type: str = "joint_angle",
+    dataset: str = "test",
+    episode_length: int = 300,
+    task_ids: list[int] | None = None,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+) -> dict[str, dict[int, gym.vector.VectorEnv]]:
+    """Create vectorized RoboMME environments for evaluation.
+
+    `task` may be a single RoboMME task name (e.g. "PickXtimes") or a
+    comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task
+    becomes its own suite in the returned mapping.
+
+    Args:
+        task: Task name or comma-separated task names; blanks are ignored.
+        n_envs: Number of env factories per ``(task, task_id)`` pair.
+        action_space_type: Forwarded to each ``RoboMMEGymEnv``.
+        dataset: Forwarded to each ``RoboMMEGymEnv``.
+        episode_length: Forwarded as each env's ``max_steps``.
+        task_ids: Ids to build per task; defaults to ``[0]``. Env ``i`` of a
+            given id uses episode index ``task_id + i``.
+        env_cls: Vector env constructor. When it is
+            ``gym.vector.AsyncVectorEnv`` a lazy wrapper is used instead and
+            spaces/metadata are probed once, then shared.
+
+    Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
+
+    Raises:
+        ValueError: If ``env_cls`` is not callable, ``n_envs`` is not a
+            positive int, or ``task`` contains no task name.
+    """
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    task_names = [t.strip() for t in str(task).split(",") if t.strip()]
+    # An empty selection would otherwise yield an empty mapping and a silent
+    # no-op evaluation; reject it up front.
+    if not task_names:
+        raise ValueError("`task` must contain at least one RoboMME task name.")
+
+    if task_ids is None:
+        task_ids = [0]
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    # Spaces/metadata probed by the first lazy env are reused by later ones so
+    # only one throwaway env is ever constructed in this process.
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
+    out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
+    for task_name in task_names:
+        envs_by_task: dict[int, gym.vector.VectorEnv] = {}
+        for task_id in task_ids:
+            fns = _make_env_fns(
+                task=task_name,
+                n_envs=n_envs,
+                action_space_type=action_space_type,
+                dataset=dataset,
+                episode_length=episode_length,
+                task_id=task_id,
+            )
+            if is_async:
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+                if cached_obs_space is None:
+                    cached_obs_space = lazy.observation_space
+                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
+                envs_by_task[task_id] = lazy
+            else:
+                envs_by_task[task_id] = env_cls(fns)
+        out[task_name] = envs_by_task
+    return out
@@ -0,0 +1,488 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import importlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+# Camera names as used by RoboTwin 2.0. The wrapper looks each name up under
+# get_obs()["observation"][<name>]["rgb"] (e.g. "head_camera" →
+# get_obs()["observation"]["head_camera"]["rgb"]).
+ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
+    "head_camera",
+    "left_camera",
+    "right_camera",
+)
+
+ACTION_DIM = 14  # 7 DOF × 2 arms
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+DEFAULT_EPISODE_LENGTH = 300
+# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
+DEFAULT_CAMERA_H = 240
+DEFAULT_CAMERA_W = 320
+
+# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
+# (50 tasks as of main; earlier revisions had 60 with a different split).
+# Keep this in sync with:
+#   gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
+#     | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
+ROBOTWIN_TASKS: tuple[str, ...] = (
+    "adjust_bottle",
+    "beat_block_hammer",
+    "blocks_ranking_rgb",
+    "blocks_ranking_size",
+    "click_alarmclock",
+    "click_bell",
+    "dump_bin_bigbin",
+    "grab_roller",
+    "handover_block",
+    "handover_mic",
+    "hanging_mug",
+    "lift_pot",
+    "move_can_pot",
+    "move_pillbottle_pad",
+    "move_playingcard_away",
+    "move_stapler_pad",
+    "open_laptop",
+    "open_microwave",
+    "pick_diverse_bottles",
+    "pick_dual_bottles",
+    "place_a2b_left",
+    "place_a2b_right",
+    "place_bread_basket",
+    "place_bread_skillet",
+    "place_burger_fries",
+    "place_can_basket",
+    "place_cans_plasticbox",
+    "place_container_plate",
+    "place_dual_shoes",
+    "place_empty_cup",
+    "place_fan",
+    "place_mouse_pad",
+    "place_object_basket",
+    "place_object_scale",
+    "place_object_stand",
+    "place_phone_stand",
+    "place_shoe",
+    "press_stapler",
+    "put_bottles_dustbin",
+    "put_object_cabinet",
+    "rotate_qrcode",
+    "scan_object",
+    "shake_bottle",
+    "shake_bottle_horizontally",
+    "stack_blocks_three",
+    "stack_blocks_two",
+    "stack_bowls_three",
+    "stack_bowls_two",
+    "stamp_seal",
+    "turn_switch",
+)
+
+
+# Memoized setup kwargs built by _load_robotwin_setup_kwargs, keyed by task name,
+# so the YAML config files are read at most once per task in each process.
+_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
+
+
+def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
+    """Build the kwargs dict RoboTwin's setup_demo expects.
+
+    Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
+    reads ``task_config/demo_clean.yml``, resolves the embodiment file from
+    ``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
+    reads camera dimensions from ``_camera_config.yml``.
+
+    Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
+    used by beat_block_hammer and most smoke-test tasks).
+
+    The assembled dict is cached per ``task_name``. Every call returns an
+    independent deep copy, so mutating the result (including its nested
+    config mappings) cannot alter the cached entry or other callers' copies.
+
+    Args:
+        task_name: RoboTwin task name; stored under the ``"task_name"`` key.
+
+    Returns:
+        A fresh kwargs dict for ``setup_demo``.
+
+    Raises:
+        ValueError: If the ``embodiment`` entry has neither 1 nor 3 items.
+    """
+    import copy
+
+    if task_name in _ROBOTWIN_SETUP_CACHE:
+        return copy.deepcopy(_ROBOTWIN_SETUP_CACHE[task_name])
+
+    # RoboTwin-only imports stay local so this module imports without RoboTwin.
+    import os
+
+    import yaml  # type: ignore[import-untyped]
+    from envs import CONFIGS_PATH  # type: ignore[import-not-found]
+
+    def _read_yaml(path: str) -> Any:
+        """Parse one UTF-8 YAML file with the safe loader."""
+        with open(path, encoding="utf-8") as f:
+            return yaml.safe_load(f)
+
+    task_config = "demo_clean"
+    args = _read_yaml(os.path.join(CONFIGS_PATH, f"{task_config}.yml"))
+
+    # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
+    embodiment_types = _read_yaml(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"))
+    embodiment = args.get("embodiment", ["aloha-agilex"])
+    if len(embodiment) == 1:
+        # One entry: the same robot file drives both arms.
+        robot_file = embodiment_types[embodiment[0]]["file_path"]
+        args["left_robot_file"] = robot_file
+        args["right_robot_file"] = robot_file
+        args["dual_arm_embodied"] = True
+    elif len(embodiment) == 3:
+        # Three entries: left robot, right robot, then the value stored as `embodiment_dis`.
+        args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
+        args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
+        args["embodiment_dis"] = embodiment[2]
+        args["dual_arm_embodied"] = False
+    else:
+        raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
+
+    args["left_embodiment_config"] = _read_yaml(os.path.join(args["left_robot_file"], "config.yml"))
+    args["right_embodiment_config"] = _read_yaml(os.path.join(args["right_robot_file"], "config.yml"))
+
+    # Camera dimensions
+    camera_config = _read_yaml(os.path.join(CONFIGS_PATH, "_camera_config.yml"))
+    head_cam = args["camera"]["head_camera_type"]
+    args["head_camera_h"] = camera_config[head_cam]["h"]
+    args["head_camera_w"] = camera_config[head_cam]["w"]
+
+    # Headless overrides
+    args["render_freq"] = 0
+    args["task_name"] = task_name
+    args["task_config"] = task_config
+
+    _ROBOTWIN_SETUP_CACHE[task_name] = args
+    return copy.deepcopy(args)
+
+
+def _load_robotwin_task(task_name: str) -> type:
+    """Import ``envs.<task_name>`` and return the class named ``task_name`` from it.
+
+    RoboTwin keeps each task in ``envs/<task_name>.py``; that package must be
+    importable (i.e. on ``sys.path``) for the lookup to succeed.
+
+    Raises:
+        ModuleNotFoundError: If importing ``envs.<task_name>`` fails with a
+            missing module.
+        AttributeError: If the module has no (non-``None``) attribute named
+            ``task_name``.
+    """
+    module_path = f"envs.{task_name}"
+    try:
+        task_module = importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(
+            f"Could not import RoboTwin task '{task_name}'. "
+            "Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
+            "See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
+        ) from e
+
+    # The task class shares its name with the module that defines it.
+    task_cls = getattr(task_module, task_name, None)
+    if task_cls is not None:
+        return task_cls
+    raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
+
+
+class RoboTwinEnv(gym.Env):
+    """Gymnasium wrapper around a single RoboTwin 2.0 task.
+
+    RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
+    ``take_action`` / ``check_success``) rather than the standard gym interface.
+    This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
+    RoboTwin exactly like LIBERO or Meta-World.
+
+    The underlying SAPIEN environment is created lazily on the first ``reset()``
+    call *inside the worker process*.  This is required for
+    ``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
+    contexts that must not be forked from the parent process.
+
+    Observations
+    ------------
+    The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
+    ``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
+    ``envs/utils.py`` then converts these to ``observation.images.<cam>``.
+
+    Actions
+    -------
+    14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
+
+    Autograd
+    --------
+    ``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
+    optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
+    the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
+
+    def __init__(
+        self,
+        task_name: str,
+        episode_index: int = 0,
+        n_envs: int = 1,
+        camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+        observation_height: int | None = None,
+        observation_width: int | None = None,
+        episode_length: int = DEFAULT_EPISODE_LENGTH,
+        render_mode: str = "rgb_array",
+    ):
+        """Store configuration and declare spaces; the simulator is not created here.
+
+        Args:
+            task_name: RoboTwin task to load on first ``reset()``.
+            episode_index: Seed used by the next unseeded ``reset()``.
+            n_envs: Amount added to ``episode_index`` after every ``reset()``.
+            camera_names: Cameras exposed under ``pixels``.
+            observation_height: Declared frame height; falsy values fall back
+                to ``DEFAULT_CAMERA_H``.
+            observation_width: Declared frame width; falsy values fall back to
+                ``DEFAULT_CAMERA_W``.
+            episode_length: Step count at which ``step()`` reports truncation.
+            render_mode: Stored on the instance as ``render_mode``.
+        """
+        super().__init__()
+        self.task_name = task_name
+        self.task = task_name  # used by add_envs_task() in utils.py
+        self.task_description = task_name.replace("_", " ")
+        self.episode_index = episode_index
+        # Each reset advances episode_index by this stride (see reset()).
+        self._reset_stride = n_envs
+        self.camera_names = list(camera_names)
+        # Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
+        # The YAML-driven lookup is deferred to reset() so construction doesn't
+        # import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
+        # NOTE(review): the dims read at reset() are only forwarded to setup_demo;
+        # observation_space below keeps the values chosen here.
+        self.observation_height = observation_height or DEFAULT_CAMERA_H
+        self.observation_width = observation_width or DEFAULT_CAMERA_W
+        self.episode_length = episode_length
+        self._max_episode_steps = episode_length  # lerobot_eval.rollout reads this
+        self.render_mode = render_mode
+
+        self._env: Any | None = None  # deferred — created on first reset() inside worker
+        self._step_count: int = 0
+        # Placeholder frame returned for any camera missing from get_obs().
+        self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
+
+        image_spaces = {
+            cam: spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+            for cam in self.camera_names
+        }
+        self.observation_space = spaces.Dict(
+            {
+                "pixels": spaces.Dict(image_spaces),
+                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
+            }
+        )
+        self.action_space = spaces.Box(
+            low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
+        )
+
+    def _ensure_env(self) -> None:
+        """Create the SAPIEN environment on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own EGL/GPU context rather than inheriting a stale one from the
+        parent process (which causes crashes with AsyncVectorEnv).
+        """
+        if self._env is not None:
+            return
+        task_cls = _load_robotwin_task(self.task_name)
+        self._env = task_cls()
+
+    def _get_obs(self) -> RobotObservation:
+        """Convert the task's ``get_obs()`` output to ``{"pixels", "agent_pos"}``.
+
+        Cameras without an ``rgb`` entry get the shared black placeholder
+        frame; a missing or too-short joint vector yields zeros.
+        """
+        assert self._env is not None, "_get_obs called before _ensure_env()"
+        raw = self._env.get_obs()
+        cameras_raw = raw.get("observation", {})
+
+        images: dict[str, np.ndarray] = {}
+        for cam in self.camera_names:
+            cam_data = cameras_raw.get(cam)
+            img = cam_data.get("rgb") if cam_data else None
+            if img is None:
+                images[cam] = self._black_frame
+                continue
+            img = np.asarray(img, dtype=np.uint8)
+            if img.ndim == 2:
+                # Single-channel frame: replicate it into three channels.
+                img = np.stack([img, img, img], axis=-1)
+            elif img.shape[-1] != 3:
+                # Keep only the first three channels (e.g. drop a 4th channel).
+                img = img[..., :3]
+            # NOTE(review): frames are not resized, so they are assumed to
+            # already match the declared observation_space shape — confirm.
+            images[cam] = img
+
+        ja = raw.get("joint_action") or {}
+        vec = ja.get("vector")
+        if vec is not None:
+            arr = np.asarray(vec, dtype=np.float32).ravel()
+            # Use the first ACTION_DIM values; fall back to zeros if fewer are present.
+            joint_state = (
+                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
+            )
+        else:
+            joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
+
+        return {"pixels": images, "agent_pos": joint_state}
+
+    def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
+        """Set up a new demo episode and return ``(obs, info)``.
+
+        When ``seed`` is ``None`` the current ``episode_index`` is used as the
+        seed. After setup, ``episode_index`` advances by the reset stride and
+        the step counter is cleared. Extra keyword arguments are ignored.
+        """
+        self._ensure_env()
+        super().reset(seed=seed)
+        assert self._env is not None  # set by _ensure_env() above
+
+        actual_seed = self.episode_index if seed is None else seed
+        setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
+        setup_kwargs.update(seed=actual_seed, is_test=True)
+        # Re-enable autograd: see the "Autograd" section of the class docstring.
+        with torch.enable_grad():
+            self._env.setup_demo(**setup_kwargs)
+        self.episode_index += self._reset_stride
+        self._step_count = 0
+
+        obs = self._get_obs()
+        return obs, {"is_success": False, "task": self.task_name}
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one action and return the Gymnasium 5-tuple.
+
+        Reward is ``1.0`` on success and ``0.0`` otherwise; success terminates
+        the episode and reaching ``episode_length`` steps truncates it.
+
+        Raises:
+            ValueError: If ``action`` is not a 1-D array of length ``ACTION_DIM``.
+        """
+        assert self._env is not None, "step() called before reset()"
+        if action.ndim != 1 or action.shape[0] != ACTION_DIM:
+            raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
+
+        with torch.enable_grad():
+            # Prefer RoboTwin's take_action(); fall back to step() if it is absent.
+            if hasattr(self._env, "take_action"):
+                self._env.take_action(action)
+            else:
+                self._env.step(action)
+
+        self._step_count += 1
+
+        # Read the eval_success attribute first; only call check_success() when it is falsy.
+        is_success = bool(getattr(self._env, "eval_success", False))
+        if not is_success and hasattr(self._env, "check_success"):
+            is_success = bool(self._env.check_success())
+
+        obs = self._get_obs()
+        reward = float(is_success)
+        terminated = is_success
+        truncated = self._step_count >= self.episode_length
+
+        info: dict[str, Any] = {
+            "task": self.task_name,
+            "is_success": is_success,
+            "step": self._step_count,
+        }
+        if terminated or truncated:
+            info["final_info"] = {
+                "task": self.task_name,
+                "is_success": is_success,
+            }
+            # The env resets itself here; `obs` returned below was captured
+            # before this reset, so it is the final observation of the episode.
+            self.reset()
+
+        return obs, reward, terminated, truncated, info
+
+    def render(self) -> np.ndarray:
+        """Return the current head-camera frame, or the first available camera's frame."""
+        self._ensure_env()
+        obs = self._get_obs()
+        # Prefer head camera for rendering; fall back to first available.
+        if "head_camera" in obs["pixels"]:
+            return obs["pixels"]["head_camera"]
+        return next(iter(obs["pixels"].values()))
+
+    def close(self) -> None:
+        """Release the underlying task env, calling its ``close_env()`` when present."""
+        if self._env is not None:
+            if hasattr(self._env, "close_env"):
+                import contextlib
+
+                # A TypeError raised by close_env() is deliberately ignored.
+                with contextlib.suppress(TypeError):
+                    self._env.close_env()
+            self._env = None
+
+
+# ---- Multi-task factory --------------------------------------------------------
+
+
+def _make_env_fns(
+    *,
+    task_name: str,
+    n_envs: int,
+    camera_names: list[str],
+    observation_height: int,
+    observation_width: int,
+    episode_length: int,
+) -> list[Callable[[], RoboTwinEnv]]:
+    """Build ``n_envs`` zero-argument ``RoboTwinEnv`` factories for one task.
+
+    Factory ``i`` starts at ``episode_index=i``; every env also receives
+    ``n_envs`` itself, alongside the shared camera and episode settings.
+    """
+
+    def _build(start_index: int) -> RoboTwinEnv:
+        env_kwargs = {
+            "task_name": task_name,
+            "episode_index": start_index,
+            "n_envs": n_envs,
+            "camera_names": camera_names,
+            "observation_height": observation_height,
+            "observation_width": observation_width,
+            "episode_length": episode_length,
+        }
+        return RoboTwinEnv(**env_kwargs)
+
+    factories: list[Callable[[], RoboTwinEnv]] = []
+    for slot in range(n_envs):
+        # Bind the starting index now; the env is only constructed when called.
+        factories.append(partial(_build, slot))
+    return factories
+
+
+def create_robotwin_envs(
+    task: str,
+    n_envs: int,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
+    observation_height: int = DEFAULT_CAMERA_H,
+    observation_width: int = DEFAULT_CAMERA_W,
+    episode_length: int = DEFAULT_EPISODE_LENGTH,
+) -> dict[str, dict[int, Any]]:
+    """Build one vectorized RoboTwin 2.0 environment per requested task.
+
+    Args:
+        task: One task name or several separated by commas (e.g.
+            ``"beat_block_hammer,click_bell"``); blank entries are ignored.
+        n_envs: Number of parallel rollouts per task.
+        env_cls: Vector env constructor. When it is exactly
+            ``gym.vector.AsyncVectorEnv`` a lazy wrapper is used instead, and
+            the spaces/metadata probed by the first one are shared with the rest.
+        camera_names: Cameras to include in observations.
+        observation_height: Pixel height for all cameras.
+        observation_width: Pixel width for all cameras.
+        episode_length: Max steps before truncation.
+
+    Returns:
+        ``{task_name: {0: vector_env}}`` with one entry per task.
+
+    Raises:
+        ValueError: If ``env_cls`` is not callable, ``n_envs`` is not a
+            positive int, no task name is given, or a name is not in
+            ``ROBOTWIN_TASKS``.
+    """
+    if not callable(env_cls):
+        raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
+    if not (isinstance(n_envs, int) and n_envs > 0):
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    stripped_parts = (part.strip() for part in str(task).split(","))
+    task_names = [name for name in stripped_parts if name]
+    if len(task_names) == 0:
+        raise ValueError("`task` must contain at least one RoboTwin task name.")
+
+    unknown = [name for name in task_names if name not in ROBOTWIN_TASKS]
+    if len(unknown) > 0:
+        raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
+
+    logger.info(
+        "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
+        task_names,
+        n_envs,
+    )
+
+    use_lazy = env_cls is gym.vector.AsyncVectorEnv
+    # Filled in by the first lazy env so later ones skip the probing step.
+    shared_obs_space: spaces.Space | None = None
+    shared_act_space: spaces.Space | None = None
+    shared_metadata: dict[str, Any] | None = None
+
+    suites: dict[str, dict[int, Any]] = {}
+    for name in task_names:
+        factories = _make_env_fns(
+            task_name=name,
+            n_envs=n_envs,
+            camera_names=list(camera_names),
+            observation_height=observation_height,
+            observation_width=observation_width,
+            episode_length=episode_length,
+        )
+        if not use_lazy:
+            vec_env = env_cls(factories)
+        else:
+            vec_env = _LazyAsyncVectorEnv(factories, shared_obs_space, shared_act_space, shared_metadata)
+            if shared_obs_space is None:
+                shared_obs_space = vec_env.observation_space
+                shared_act_space = vec_env.action_space
+                shared_metadata = vec_env.metadata
+        # Each task maps to a single vector env stored under id 0.
+        suites[name] = {0: vec_env}
+        logger.info("Built vec env | task=%s | n_envs=%d", name, n_envs)
+
+    return suites
@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
 from .configs import EnvConfig


+def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
+    """Turn ``camera_name`` into a non-empty list of camera name strings.
+
+    A string is split on commas (``"cam_a,cam_b"``); a list or tuple has each
+    item converted with ``str()``. Every entry is stripped of surrounding
+    whitespace and entries that end up empty are discarded.
+
+    Raises:
+        TypeError: If ``camera_name`` is neither a string nor a list/tuple.
+        ValueError: If no entry survives normalization.
+    """
+    if isinstance(camera_name, str):
+        raw_items = camera_name.split(",")
+    elif isinstance(camera_name, (list, tuple)):
+        raw_items = [str(item) for item in camera_name]
+    else:
+        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
+
+    cams = []
+    for item in raw_items:
+        cleaned = item.strip()
+        if cleaned:
+            cams.append(cleaned)
+
+    if len(cams) == 0:
+        raise ValueError("camera_name resolved to an empty list.")
+    return cams
+
+
 def _convert_nested_dict(d):
    result = {}
    for k, v in d.items():
@@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv:
        env_fns: list[Callable],
        observation_space=None,
        action_space=None,
+        metadata=None,
    ):
        self._env_fns = env_fns
        self._env: gym.vector.AsyncVectorEnv | None = None
        self.num_envs = len(env_fns)
-        if observation_space is not None and action_space is not None:
+        if observation_space is not None and action_space is not None and metadata is not None:
            self.observation_space = observation_space
            self.action_space = action_space
+            self.metadata = metadata
        else:
            tmp = env_fns[0]()
            self.observation_space = tmp.observation_space
            self.action_space = tmp.action_space
+            self.metadata = tmp.metadata
            tmp.close()
        self.single_observation_space = self.observation_space
        self.single_action_space = self.action_space
@@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv:
        if self._env is None:
            self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)

+    @property
+    def unwrapped(self):
+        """Return this lazy wrapper itself; nothing is unwrapped and no env is started."""
+        return self
+
    def reset(self, **kwargs):
        self._ensure()
        return self._env.reset(**kwargs)
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VLABench environment wrapper for LeRobot.
+
+VLABench is a large-scale benchmark for language-conditioned robotic manipulation
+with long-horizon reasoning, built on MuJoCo/dm_control.
+
+- Paper: https://arxiv.org/abs/2412.18194
+- GitHub: https://github.com/OpenMOSS/VLABench
+- Website: https://vlabench.github.io
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from typing import Any
+
+import cv2
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+from scipy.spatial.transform import Rotation
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
+ACTION_DIM = 7  # pos(3) + euler(3) + gripper(1)
+ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32)
+ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+
+# Default max episode steps per task type
+DEFAULT_MAX_EPISODE_STEPS = 500
+
+# VLABench task suites
+PRIMITIVE_TASKS = [
+    "select_fruit",
+    "select_toy",
+    "select_chemistry_tube",
+    "add_condiment",
+    "select_book",
+    "select_painting",
+    "select_drink",
+    "insert_flower",
+    "select_billiards",
+    "select_ingredient",
+    "select_mahjong",
+    "select_poker",
+    # Physical series
+    "density_qa",
+    "friction_qa",
+    "magnetism_qa",
+    "reflection_qa",
+    "simple_cuestick_usage",
+    "simple_seesaw_usage",
+    "sound_speed_qa",
+    "thermal_expansion_qa",
+    "weight_qa",
+]
+
+COMPOSITE_TASKS = [
+    "cluster_billiards",
+    "cluster_book",
+    "cluster_drink",
+    "cluster_toy",
+    "cook_dishes",
+    "cool_drink",
+    "find_unseen_object",
+    "get_coffee",
+    "hammer_nail",
+    "heat_food",
+    "make_juice",
+    "play_mahjong",
+    "play_math_game",
+    "play_poker",
+    "play_snooker",
+    "rearrange_book",
+    "rearrange_chemistry_tube",
+    "set_dining_table",
+    "set_study_table",
+    "store_food",
+    "take_chemistry_experiment",
+    "use_seesaw_complex",
+]
+
+SUITE_TASKS: dict[str, list[str]] = {
+    "primitive": PRIMITIVE_TASKS,
+    "composite": COMPOSITE_TASKS,
+}
+
+
+class VLABenchEnv(gym.Env):
+    """Gymnasium wrapper for VLABench environments.
+
+    Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface.
+    Supports multiple cameras (front, second, wrist) and end-effector control.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
+
+    def __init__(
+        self,
+        task: str = "select_fruit",
+        obs_type: str = "pixels_agent_pos",
+        render_mode: str = "rgb_array",
+        render_resolution: tuple[int, int] = (480, 480),
+        robot: str = "franka",
+        max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS,
+        action_mode: str = "eef",
+    ):
+        super().__init__()
+        self.task = task
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.render_resolution = render_resolution
+        self.robot = robot
+        self._max_episode_steps = max_episode_steps
+        self.action_mode = action_mode
+
+        # Deferred — created on first reset() inside worker subprocess to avoid
+        # inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers.
+        # We never cache `env.physics`: dm_control exposes it as a weakref
+        # proxy that goes stale across resets (rebuilds the sim), so we always
+        # refetch it via `self._env.physics` at the call site.
+        self._env = None
+        self.task_description = ""  # populated on first reset
+        # Cached world-frame XYZ of the robot base link. The VLABench datasets
+        # log both `observation.state` positions and `actions` positions in
+        # robot-base frame (see VLABench/scripts/convert_to_lerobot.py which
+        # subtracts `robot_frame_pos` from ee_pos). The robot is attached at a
+        # fixed offset per task so this is safe to cache once per env build.
+        self._robot_base_xyz: np.ndarray | None = None
+
+        h, w = self.render_resolution
+
+        if self.obs_type == "state":
+            raise NotImplementedError(
+                "The 'state' observation type is not supported in VLABenchEnv. "
+                "Please use 'pixels' or 'pixels_agent_pos'."
+            )
+        elif self.obs_type == "pixels":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(
+                        {
+                            "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                            "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                            "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                        }
+                    ),
+                }
+            )
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(
+                        {
+                            "image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                            "second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                            "wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
+                        }
+                    ),
+                    "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64),
+                }
+            )
+        else:
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32)
+
+    # Max attempts to rebuild the underlying env when MuJoCo throws
+    # `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step
+    # reset warm-up. Some random task/layout samples land in unstable
+    # initial configurations; re-sampling the layout almost always
+    # gives a stable one. A handful of upstream tasks (notably
+    # `select_mahjong`) have layout samplers that diverge often enough
+    # to need >>5 retries, so we pick a generous ceiling.
+    _ENSURE_ENV_MAX_ATTEMPTS = 20
+
+    def _ensure_env(self) -> None:
+        """Create the underlying VLABench env on first use.
+
+        Called inside the worker subprocess after fork(), so each worker gets
+        its own clean rendering context rather than inheriting a stale one from
+        the parent process (which causes crashes with AsyncVectorEnv).
+
+        Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20
+        warm-up `step()` calls while toggling gravity/fluids to let the scene
+        settle; for some random layouts MuJoCo's integrator diverges and
+        raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields
+        a stable one, so we retry a number of times before giving up. Between
+        attempts we reseed NumPy's global RNG from OS entropy so the upstream
+        task sampler explores fresh initial states — without this, retries
+        can replay the same diverging configuration when the sampler is
+        deterministic given the current RNG state.
+        """
+        if self._env is not None:
+            return
+
+        import VLABench.robots  # noqa: F401  # type: ignore[import-untyped]
+        import VLABench.tasks  # noqa: F401  # type: ignore[import-untyped]
+        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
+        from VLABench.envs import load_env  # type: ignore[import-untyped]
+
+        h, w = self.render_resolution
+        last_exc: PhysicsError | None = None
+        for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
+            try:
+                env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
+                self._env = env
+                break
+            except PhysicsError as exc:
+                last_exc = exc
+                logger.warning(
+                    "PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
+                    attempt,
+                    self._ENSURE_ENV_MAX_ATTEMPTS,
+                    self.task,
+                    exc,
+                )
+                np.random.seed(None)
+        if self._env is None:
+            assert last_exc is not None
+            raise RuntimeError(
+                f"VLABench task '{self.task}' failed to produce a stable "
+                f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
+                f"attempts. This task's upstream sampler diverges too "
+                f"often for the configured robot; consider removing it "
+                f"from the eval set. Last physics error: {last_exc}"
+            ) from last_exc
+
+        # Extract task description from the dm_control task
+        task_obj = self._env.task
+        if hasattr(task_obj, "task_description"):
+            self.task_description = task_obj.task_description
+        elif hasattr(task_obj, "language_instruction"):
+            self.task_description = task_obj.language_instruction
+        else:
+            self.task_description = self.task
+
+        # Cache robot base world position so `_build_ctrl_from_action` and
+        # `_get_obs` can translate between robot-frame (dataset) and
+        # world-frame (dm_control) without hitting physics every call.
+        try:
+            self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
+                3
+            )
+        except Exception:
+            # Fallback to VLABench's default Franka base position.
+            self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)
+
+    def _get_obs(self) -> dict:
+        """Get current observation from the environment."""
+        assert self._env is not None
+
+        obs = self._env.get_observation()
+        h, w = self.render_resolution
+
+        def _to_hwc3(arr: np.ndarray) -> np.ndarray:
+            """Coerce any camera array to the declared (h, w, 3) uint8 shape."""
+            a = np.asarray(arr)
+            # Drop a leading singleton batch dim if present.
+            while a.ndim > 3 and a.shape[0] == 1:
+                a = a[0]
+            if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
+                # CHW → HWC
+                a = np.transpose(a, (1, 2, 0))
+            if a.ndim == 2:
+                a = np.stack([a] * 3, axis=-1)
+            if a.ndim != 3:
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            # Force 3 channels.
+            if a.shape[-1] == 1:
+                a = np.repeat(a, 3, axis=-1)
+            elif a.shape[-1] == 4:
+                a = a[..., :3]
+            elif a.shape[-1] != 3:
+                return np.zeros((h, w, 3), dtype=np.uint8)
+            if a.shape[:2] != (h, w):
+                a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
+            return a.astype(np.uint8)
+
+        # Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
+        raw_frames: list[np.ndarray] = []
+        if "rgb" in obs:
+            rgb = obs["rgb"]
+            if isinstance(rgb, np.ndarray):
+                if rgb.ndim == 4:
+                    raw_frames = [rgb[i] for i in range(rgb.shape[0])]
+                elif rgb.ndim == 3:
+                    raw_frames = [rgb]
+
+        image_keys = ["image", "second_image", "wrist_image"]
+        images: dict[str, np.ndarray] = {}
+        for i, key in enumerate(image_keys):
+            if i < len(raw_frames):
+                images[key] = _to_hwc3(raw_frames[i])
+            else:
+                images[key] = np.zeros((h, w, 3), dtype=np.uint8)
+
+        # Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
+        # to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
+        # gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
+        # are stored in robot-base frame and orientations as scipy extrinsic
+        # 'xyz' euler angles.
+        raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
+        pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
+        quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
+        gripper = float(raw[7]) if raw.size >= 8 else 0.0
+
+        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
+        pos_robot = pos_world - base
+        euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
+            "xyz", degrees=False
+        )
+
+        ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)
+
+        if self.obs_type == "pixels":
+            return {"pixels": images}
+        elif self.obs_type == "pixels_agent_pos":
+            return {
+                "pixels": images,
+                "agent_pos": ee_state.astype(np.float64),
+            }
+        else:
+            raise ValueError(f"Unknown obs_type: {self.obs_type}")
+
+    # ---- Action adaptation (EEF → joint ctrl) --------------------------------
+    #
+    # The HF vlabench datasets log 7D actions
+    # `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
+    # exactly matching VLABench's own eval pipeline (evaluator.base):
+    #   pos, euler, g = policy(...)
+    #   quat = euler_to_quaternion(*euler)      # extrinsic xyz -> wxyz
+    #   _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
+    #   env.step(np.concatenate([qpos, [g, g]]))
+    #
+    # VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
+    # Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
+    # above conversion so the policy's EEF commands actually drive the robot.
+
+    _FRANKA_FINGER_OPEN = 0.04  # qpos when gripper fully open
+
+    def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
+        """Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector.
+
+        For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) +
+        2 gripper finger qposes (open/closed based on the gripper scalar).
+        If the action is already joint-space (shape matches ctrl_dim), pass
+        through.
+
+        Args:
+            action: 1-D action vector. A length of ``ctrl_dim`` is passed
+                through, a length of 7 is treated as
+                ``[x, y, z, rx, ry, rz, gripper]``, and any other length is
+                zero-padded/truncated to ``ctrl_dim``.
+            ctrl_dim: Number of entries in the simulator's control vector.
+
+        Returns:
+            A float64 array of length ``ctrl_dim``.
+        """
+        if action.shape[0] == ctrl_dim:
+            return action.astype(np.float64, copy=False)
+
+        if action.shape[0] != 7:
+            # Unknown layout — fall back to zero-pad so the sim doesn't crash.
+            padded = np.zeros(ctrl_dim, dtype=np.float64)
+            padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim]
+            return padded
+
+        from dm_control.utils.inverse_kinematics import qpos_from_site_pose
+
+        # Action position is in robot-base frame (see convert_to_lerobot.py);
+        # dm_control's IK expects a world-frame target.
+        base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
+        pos_world = np.asarray(action[:3], dtype=np.float64) + base
+        rx, ry, rz = float(action[3]), float(action[4]), float(action[5])
+        # The gripper command is clamped to [0, 1] before being scaled below.
+        gripper = float(np.clip(action[6], 0.0, 1.0))
+
+        # Dataset euler is scipy extrinsic 'xyz' (same as VLABench's
+        # `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK
+        # and MuJoCo use `[w, x, y, z]`, so reorder.
+        qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat()
+        quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64)
+
+        assert self._env is not None
+        robot = self._env.task.robot
+        site_name = robot.end_effector_site.full_identifier
+
+        # inplace=False so IK doesn't mutate physics state mid-step — we only
+        # want the solved qpos. Fetch a fresh physics handle — caching it can
+        # yield a stale weakref after a reset.
+        ik_result = qpos_from_site_pose(
+            self._env.physics,
+            site_name=site_name,
+            target_pos=pos_world,
+            target_quat=quat,
+            inplace=False,
+            max_steps=100,
+        )
+        # NOTE(review): only `ik_result.qpos` is read; whether the solver
+        # actually converged is not checked here — confirm this is intended.
+        n_dof = robot.n_dof  # 7 for Franka
+        arm_qpos = ik_result.qpos[:n_dof]
+
+        # Dataset gripper convention: 1 = open (finger qpos = 0.04),
+        # 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
+        # where `trajectory[i][-1] > 0.03` is encoded as `1`.
+        finger_qpos = gripper * self._FRANKA_FINGER_OPEN
+
+        ctrl = np.zeros(ctrl_dim, dtype=np.float64)
+        ctrl[:n_dof] = arm_qpos
+        # Remaining entries are gripper fingers (usually 2 for Franka).
+        ctrl[n_dof:] = finger_qpos
+        return ctrl
+
+    def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
+        self._ensure_env()
+        assert self._env is not None
+        super().reset(seed=seed)
+
+        if seed is not None:
+            self._seed_inner_env(int(self.np_random.integers(0, 2**31 - 1)))
+
+        self._env.reset()
+
+        observation = self._get_obs()
+        info = {"is_success": False}
+        return observation, info
+
+    def _seed_inner_env(self, seed: int) -> None:
+        """Propagate `seed` to the inner dm_control env. `Environment.reset()`
+        doesn't accept a seed, so we re-seed the task and environment
+        `RandomState`s directly. Best-effort: silently skipped when the
+        expected attributes are absent on a given VLABench version.
+        """
+        for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
+            owner = getattr(self._env, owner_attr) if owner_attr else self._env
+            rng = getattr(owner, rng_attr, None)
+            rng_seed = getattr(rng, "seed", None)
+            if callable(rng_seed):
+                rng_seed(seed)
+
+    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
+        """Apply one action to the simulator and report the outcome.
+
+        Args:
+            action: 1-D action vector; it is converted to the simulator's
+                control vector by ``_build_ctrl_from_action``.
+
+        Returns:
+            ``(observation, reward, terminated, truncated, info)``.
+            ``terminated`` equals ``info["is_success"]`` and ``truncated`` is
+            always ``False``. If the simulator raises ``PhysicsError`` the
+            episode ends with reward ``0.0``, ``terminated=True`` and
+            ``info["physics_error"] = True``.
+
+        Raises:
+            ValueError: If ``action`` is not 1-D or ``self.action_mode`` is not
+                one of ``"eef"``, ``"joint"``, ``"delta_eef"``.
+        """
+        from dm_control.rl.control import PhysicsError  # type: ignore[import-untyped]
+
+        self._ensure_env()
+        assert self._env is not None
+
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
+
+        if self.action_mode not in ("eef", "joint", "delta_eef"):
+            raise ValueError(f"Unknown action_mode: {self.action_mode}")
+
+        # Always refetch physics — dm_control returns a weakref proxy that can
+        # go stale across resets.
+        physics = self._env.physics
+        ctrl_dim = int(physics.data.ctrl.shape[0])
+        ctrl = self._build_ctrl_from_action(action, ctrl_dim)
+        try:
+            timestep = self._env.step(ctrl)
+        except PhysicsError as exc:
+            # Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
+            # a graceful failed termination rather than a hard crash — the
+            # rest of the multi-task eval should still run.
+            logger.warning(
+                "PhysicsError during step on task '%s': %s. Terminating episode.",
+                self.task,
+                exc,
+            )
+            # The observation is read before the env is dropped below.
+            observation = self._get_obs()
+            info = {"task": self.task, "is_success": False, "physics_error": True}
+            # Drop the stale env so the next reset() rebuilds it cleanly.
+            with contextlib.suppress(Exception):
+                self._env.close()
+            self._env = None
+            return observation, 0.0, True, False, info
+
+        # Extract reward from dm_control timestep
+        reward = float(timestep.reward) if timestep.reward is not None else 0.0
+
+        # Check success via the task's termination condition
+        is_success = False
+        if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
+            is_success = bool(self._env.task.should_terminate_episode(self._env.physics))
+
+        terminated = is_success
+        truncated = False
+        info = {
+            "task": self.task,
+            "is_success": is_success,
+        }
+
+        observation = self._get_obs()
+
+        # The env is reset after the observation above was captured, so the
+        # returned observation is the final one of the finished episode.
+        if terminated:
+            self.reset()
+
+        return observation, reward, terminated, truncated, info
+
+    def render(self) -> np.ndarray:
+        self._ensure_env()
+        obs = self._get_obs()
+        return obs["pixels"]["image"]
+
+    def close(self):
+        if self._env is not None:
+            self._env.close()
+            self._env = None
+
+
+# ---- Main API ----------------------------------------------------------------
+
+
+def create_vlabench_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] | None = None,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+) -> dict[str, dict[int, Any]]:
+    """
+    Create vectorized VLABench environments with a consistent return shape.
+
+    Returns:
+        dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
+
+    Notes:
+        - n_envs is the number of rollouts *per task*.
+        - `task` can be a suite name ("primitive", "composite"), a comma-separated list of
+          suite names, or individual task names (e.g. "select_fruit,heat_food").
+    """
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    gym_kwargs = dict(gym_kwargs or {})
+    task_groups = [t.strip() for t in task.split(",") if t.strip()]
+    if not task_groups:
+        raise ValueError("`task` must contain at least one VLABench task or suite name.")
+
+    logger.info(
+        "Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
+        task_groups,
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    cached_obs_space = None
+    cached_act_space = None
+    cached_metadata = None
+    out: dict[str, dict[int, Any]] = defaultdict(dict)
+
+    for group in task_groups:
+        # Check if it's a suite name, otherwise treat as individual task
+        tasks = SUITE_TASKS.get(group, [group])
+
+        for tid, task_name in enumerate(tasks):
+            logger.info(
+                "Building vec env | group=%s | task_id=%d | task=%s",
+                group,
+                tid,
+                task_name,
+            )
+
+            fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
+
+            if is_async:
+                lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+                if cached_obs_space is None:
+                    cached_obs_space = lazy.observation_space
+                    cached_act_space = lazy.action_space
+                    cached_metadata = lazy.metadata
+                out[group][tid] = lazy
+            else:
+                out[group][tid] = env_cls(fns)
+
+    return {group: dict(task_map) for group, task_map in out.items()}
@@ -655,7 +655,6 @@ class VLAFlowMatching(nn.Module):
                pad_masks.append(image_start_mask)

            img_emb = self.vlm_with_expert.embed_image(img)
-            img_emb = img_emb

            # Normalize image embeddings
            img_emb_dim = img_emb.shape[-1]
@@ -49,6 +49,7 @@ from lerobot.optim.factory import make_optimizer_and_scheduler
 from lerobot.policies import PreTrainedPolicy, make_policy, make_pre_post_processors
 from lerobot.utils.import_utils import register_third_party_plugins
 from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
+from lerobot.utils.model_profiling import TrainingProfiler
 from lerobot.utils.random_utils import set_seed
 from lerobot.utils.utils import (
    cycle,
@@ -71,6 +72,7 @@ def update_policy(
    lr_scheduler=None,
    lock=None,
    rabc_weights_provider=None,
+    profiler: "TrainingProfiler | None" = None,
 ) -> tuple[MetricsTracker, dict]:
    """
    Performs a single training step to update the policy's weights.
@@ -103,8 +105,10 @@ def update_policy(
    if rabc_weights_provider is not None:
        rabc_batch_weights, rabc_batch_stats = rabc_weights_provider.compute_batch_weights(batch)

-    # Let accelerator handle mixed precision
-    with accelerator.autocast():
+    def _section(name: str) -> Any:
+        return profiler.section(name) if profiler is not None else nullcontext()
+
+    with _section("forward"), accelerator.autocast():
        # Use per-sample loss when RA-BC is enabled for proper weighting
        if rabc_batch_weights is not None:
            # Get per-sample losses
@@ -123,8 +127,8 @@ def update_policy(

        # TODO(rcadene): policy.unnormalize_outputs(out_dict)

-    # Use accelerator's backward method
-    accelerator.backward(loss)
+    with _section("backward"):
+        accelerator.backward(loss)

    # Clip gradients if specified
    if grad_clip_norm > 0:
@@ -134,8 +138,7 @@ def update_policy(
            policy.parameters(), float("inf"), error_if_nonfinite=False
        )

-    # Optimizer step
-    with lock if lock is not None else nullcontext():
+    with _section("optimizer"), lock if lock is not None else nullcontext():
        optimizer.step()

    optimizer.zero_grad()
@@ -316,6 +319,15 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        logging.info("Creating optimizer and scheduler")
    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)

+    profiler = (
+        TrainingProfiler.from_cfg(cfg, device) if cfg.profile_mode != "off" and is_main_process else None
+    )
+    if profiler:
+        profiler.record_deterministic_forward(
+            policy=policy, dataset=dataset, batch_size=cfg.batch_size, preprocessor=preprocessor
+        )
+        profiler.start()
+
    # Load precomputed SARM progress for RA-BC if enabled
    # Generate progress using: src/lerobot/policies/sarm/compute_rabc_weights.py
    rabc_weights = None
@@ -449,6 +461,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
            accelerator=accelerator,
            lr_scheduler=lr_scheduler,
            rabc_weights_provider=rabc_weights,
+            profiler=profiler,
        )

        # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
@@ -456,6 +469,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        step += 1
        if is_main_process:
            progbar.update(1)
+        if profiler:
+            profiler.step(step, train_tracker)
        train_tracker.step()
        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 and is_main_process
        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
@@ -551,6 +566,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):

    if is_main_process:
        progbar.close()
+        if profiler:
+            profiler.finalize()

    if eval_env:
        close_envs(eval_env)
@@ -0,0 +1,783 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+"""Model profiling — single-file entry point.
+
+Contains three things that used to live in three separate files:
+
+* `TrainingProfiler` — hooks the training loop. Captures per-step
+  forward/backward/optimizer timings, the torch profiler output, and a
+  deterministic-forward fingerprint for regression detection.
+* `POLICY_SPECS` — CI matrix of `policy_name → (steps, train_args)`.
+  Inline so there is no separate JSON to keep in sync.
+* `main()` — CI orchestrator. For each selected policy, spawns a
+  `lerobot-train` subprocess with profiling enabled, collects the
+  artifacts, and (optionally) publishes a row to a HF Hub dataset.
+
+Usage (CI):
+
+    python -m lerobot.utils.model_profiling \
+        --output_dir=./profiling-results \
+        --policies act diffusion \
+        --profile_mode=trace \
+        --publish
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import logging
+import re
+import shutil
+import statistics
+import subprocess
+import time
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from numbers import Real
+from pathlib import Path
+from typing import Any
+
+import torch
+from huggingface_hub import CommitOperationAdd, HfApi
+from huggingface_hub.errors import HfHubHTTPError
+from torch.utils.data import default_collate
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Policy matrix. Same shape as the former JSON file; inlined so the source
+# tree has one less file to keep in sync with the training args.
+# ---------------------------------------------------------------------------
+
+# CLI flag that renames the ``front``/``wrist`` camera keys to
+# ``base_0_rgb``/``left_wrist_0_rgb``; shared by the pi0, pi0_fast and pi05
+# entries of POLICY_SPECS.
+_LIBERO_RENAME_BASE_RGB = (
+    '--rename_map={"observation.images.front": "observation.images.base_0_rgb", '
+    '"observation.images.wrist": "observation.images.left_wrist_0_rgb"}'
+)
+# Same source keys, renamed to ``camera1``/``camera2``; used by the smolvla entry.
+_LIBERO_RENAME_CAMERAS = (
+    '--rename_map={"observation.images.front": "observation.images.camera1", '
+    '"observation.images.wrist": "observation.images.camera2"}'
+)
+# Optimizer/scheduler override flags (SGD + cosine decay with warmup) spliced
+# into the pi0, pi0_fast, pi05 and wall_x entries of POLICY_SPECS.
+# NOTE(review): ``num_decay_steps=12`` equals the ``"steps": 12`` used by those
+# entries — presumably the two must be changed together; confirm.
+_PI_SGD = [
+    "--use_policy_training_preset=false",
+    "--optimizer.type=sgd",
+    "--optimizer.lr=1e-5",
+    "--optimizer.weight_decay=0",
+    "--optimizer.grad_clip_norm=1.0",
+    "--scheduler.type=cosine_decay_with_warmup",
+    "--scheduler.peak_lr=1e-5",
+    "--scheduler.decay_lr=1e-6",
+    "--scheduler.num_warmup_steps=0",
+    "--scheduler.num_decay_steps=12",
+]
+
+
+# Maps a policy name to {"steps": <int>, "train_args": [<CLI flags>]}.
+# `build_train_command` reads both keys and appends the shared output,
+# logging and profiling flags; `main` iterates over the keys when no
+# ``--policies`` selection is given.
+POLICY_SPECS: dict[str, dict[str, Any]] = {
+    "act": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0]",
+            "--policy.type=act",
+            "--policy.device=cuda",
+            "--batch_size=4",
+            "--cudnn_deterministic=true",
+        ],
+    },
+    "diffusion": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0]",
+            "--policy.type=diffusion",
+            "--policy.device=cuda",
+            "--batch_size=4",
+            "--cudnn_deterministic=true",
+        ],
+    },
+    "groot": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.type=groot",
+            "--policy.base_model_path=nvidia/GR00T-N1.5-3B",
+            "--policy.tune_diffusion_model=true",
+            "--policy.tune_projector=true",
+            "--policy.tune_llm=false",
+            "--policy.tune_visual=false",
+            "--policy.use_bf16=true",
+            "--policy.device=cuda",
+            "--batch_size=1",
+            # NOTE(review): the other lerobot/libero_plus entries rename from
+            # ``front``/``wrist``; this one renames from ``image``/``image2`` —
+            # confirm against the dataset's actual camera keys.
+            '--rename_map={"observation.images.image": "observation.images.camera1", '
+            '"observation.images.image2": "observation.images.camera2"}',
+        ],
+    },
+    "multi_task_dit": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0]",
+            "--policy.type=multi_task_dit",
+            "--policy.device=cuda",
+            "--policy.horizon=32",
+            "--policy.n_action_steps=30",
+            "--batch_size=4",
+            "--cudnn_deterministic=true",
+        ],
+    },
+    "pi0": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.path=lerobot/pi0_base",
+            "--policy.device=cuda",
+            "--policy.dtype=bfloat16",
+            "--policy.n_action_steps=30",
+            "--policy.use_amp=true",
+            "--policy.gradient_checkpointing=true",
+            "--batch_size=1",
+            *_PI_SGD,
+            _LIBERO_RENAME_BASE_RGB,
+        ],
+    },
+    "pi0_fast": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.path=lerobot/pi0fast-base",
+            "--policy.device=cuda",
+            "--policy.dtype=bfloat16",
+            "--policy.n_action_steps=30",
+            "--policy.use_amp=true",
+            "--policy.gradient_checkpointing=true",
+            "--batch_size=1",
+            *_PI_SGD,
+            _LIBERO_RENAME_BASE_RGB,
+        ],
+    },
+    "pi05": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.path=lerobot/pi05_base",
+            "--policy.device=cuda",
+            "--policy.dtype=bfloat16",
+            "--policy.n_action_steps=30",
+            "--policy.use_amp=true",
+            "--policy.gradient_checkpointing=true",
+            "--batch_size=1",
+            *_PI_SGD,
+            '--policy.normalization_mapping={"ACTION": "MEAN_STD", '
+            '"STATE": "MEAN_STD", "VISUAL": "IDENTITY"}',
+            _LIBERO_RENAME_BASE_RGB,
+        ],
+    },
+    "smolvla": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.path=lerobot/smolvla_base",
+            "--policy.load_vlm_weights=true",
+            "--policy.freeze_vision_encoder=false",
+            "--policy.train_expert_only=false",
+            "--policy.empty_cameras=1",
+            "--policy.device=cuda",
+            "--batch_size=1",
+            _LIBERO_RENAME_CAMERAS,
+        ],
+    },
+    "wall_x": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/aloha_sim_insertion_human",
+            "--dataset.episodes=[0]",
+            "--policy.type=wall_x",
+            "--policy.pretrained_name_or_path=x-square-robot/wall-oss-flow",
+            "--policy.prediction_mode=diffusion",
+            "--policy.attn_implementation=eager",
+            "--policy.device=cuda",
+            "--batch_size=1",
+            *_PI_SGD,
+        ],
+    },
+    "xvla": {
+        "steps": 12,
+        "train_args": [
+            "--dataset.repo_id=lerobot/libero_plus",
+            "--dataset.episodes=[0]",
+            "--policy.path=lerobot/xvla-widowx",
+            "--policy.action_mode=auto",
+            "--policy.empty_cameras=1",
+            "--policy.device=cuda",
+            "--batch_size=1",
+            '--rename_map={"observation.images.front": "observation.images.image", '
+            '"observation.images.wrist": "observation.images.image2"}',
+        ],
+    },
+}
+
+
+# ---------------------------------------------------------------------------
+# TrainingProfiler — hooks the training loop.
+# ---------------------------------------------------------------------------
+
+
+def _stable_float(value: float | int | None) -> float | None:
+    """Return ``value`` as a float rounded to 8 decimal places.
+
+    ``None`` is passed through unchanged so optional metrics stay optional.
+    """
+    if value is None:
+        return None
+    as_float = float(value)
+    return round(as_float, 8)
+
+
+def _as_float(value: Any) -> float:
+    """Coerce a metric to ``float``.
+
+    Accepts either a real number or any object exposing a ``val`` attribute
+    (whose value is then converted).
+
+    Raises:
+        TypeError: if ``value`` is neither a real number nor has ``val``.
+    """
+    if isinstance(value, Real):
+        return float(value)
+    if not hasattr(value, "val"):
+        raise TypeError(f"Expected a real-valued metric, got {type(value).__name__}")
+    return float(value.val)
+
+
+def _summary(values: list[float]) -> dict[str, float | int | None]:
+    """Summarize a list of samples as count/mean/median/min/max.
+
+    For an empty list the count is 0 and every statistic is ``None``.
+    """
+    stat_names = ("mean", "median", "min", "max")
+    if not values:
+        return {"count": 0, **dict.fromkeys(stat_names)}
+    stat_fns = (statistics.fmean, statistics.median, min, max)
+    report: dict[str, float | int | None] = {"count": len(values)}
+    for stat_name, stat_fn in zip(stat_names, stat_fns):
+        report[stat_name] = stat_fn(values)
+    return report
+
+
+def _tensor_signature(tensor: torch.Tensor) -> dict[str, Any]:
+    """Build a compact, comparable description of ``tensor``.
+
+    The result always holds ``shape``, ``dtype``, ``numel`` and a ``sha256`` of
+    the raw bytes; non-empty tensors additionally get rounded ``sum`` and
+    ``mean`` values.
+    """
+    host = tensor.detach().cpu()
+    # bfloat16 is widened to float32 before the bytes are taken — presumably
+    # because ``.numpy()`` cannot represent bfloat16.
+    hashable = host.float() if host.dtype == torch.bfloat16 else host
+    digest = hashlib.sha256(hashable.contiguous().numpy().tobytes()).hexdigest()
+    signature: dict[str, Any] = {
+        "shape": list(host.shape),
+        "dtype": str(host.dtype),
+        "numel": host.numel(),
+        "sha256": digest,
+    }
+    if host.numel() == 0:
+        return signature
+    wide_dtype = torch.float64 if host.is_floating_point() else torch.int64
+    wide = host.to(wide_dtype)
+    signature["sum"] = _stable_float(wide.sum().item())
+    # The mean is taken after ``.float()`` (float32), not in the widened dtype.
+    signature["mean"] = _stable_float(wide.float().mean().item())
+    return signature
+
+
+def _summarize_value(value: Any) -> Any:
+    """Recursively convert ``value`` into a JSON-friendly structure.
+
+    Plain scalars and ``None`` are returned as-is, tensors become signatures,
+    dicts keep their keys, lists and tuples both become lists, and anything
+    else falls back to its ``repr``.
+    """
+    if value is None or isinstance(value, (str, int, float, bool)):
+        return value
+    if isinstance(value, torch.Tensor):
+        return _tensor_signature(value)
+    if isinstance(value, dict):
+        summarized = {}
+        for key, item in value.items():
+            summarized[key] = _summarize_value(item)
+        return summarized
+    if isinstance(value, (list, tuple)):
+        return list(map(_summarize_value, value))
+    return repr(value)
+
+
+def _hash_payload(payload: Any) -> str:
+    """Return the SHA-256 hex digest of ``payload`` serialized as key-sorted JSON."""
+    canonical = json.dumps(payload, sort_keys=True)
+    digest = hashlib.sha256()
+    digest.update(canonical.encode())
+    return digest.hexdigest()
+
+
+def _get_profiler_device_time_us(event: Any) -> float | None:
+    """Read an event's self device time, rounded for stable output.
+
+    Prefers ``self_device_time_total`` and falls back to
+    ``self_cuda_time_total``; yields ``None`` when the event has neither.
+    """
+    fallback = getattr(event, "self_cuda_time_total", None)
+    device_time = getattr(event, "self_device_time_total", fallback)
+    return _stable_float(device_time)
+
+
+def _write_profiler_table(profiler: Any, path: Path, *, sort_by: str, row_limit: int = 40) -> None:
+    """Best-effort dump of the profiler's averaged-ops table to ``path``.
+
+    Any failure (while aggregating, rendering or writing) is logged at debug
+    level and otherwise ignored, so a missing table never aborts the caller.
+    """
+    try:
+        averages = profiler.key_averages()
+        rendered = averages.table(sort_by=sort_by, row_limit=row_limit)
+        path.write_text(rendered)
+    except Exception:
+        logger.debug("Could not write profiler table for sort_by=%s", sort_by, exc_info=True)
+
+
+def write_deterministic_forward_artifacts(
+    *,
+    policy: Any,
+    dataset: Any,
+    batch_size: int,
+    preprocessor: Any,
+    output_dir: Path,
+    device_type: str,
+) -> None:
+    """Run a seed-controlled single forward pass and dump a stable fingerprint
+    (loss/output tensor hashes + op counts) for regression detection. Keeps
+    the caller-selected module mode so ACT-with-VAE-style policies that only
+    materialize their full forward outputs in `train()` still match. Models
+    with stochastic train-mode layers still rely on the seeded RNG for stable
+    fingerprints.
+
+    Args:
+        policy: Object whose ``forward(batch)`` returns a ``(loss, output_dict)`` pair.
+        dataset: Indexable, sized dataset; samples are collated with ``default_collate``.
+        batch_size: Number of samples in the reference batch.
+        preprocessor: Callable applied to the collated batch before the forward pass.
+        output_dir: Directory receiving ``deterministic_forward.json`` and
+            ``deterministic_forward_ops.txt`` (created if missing).
+        device_type: ``"cuda"`` enables CUDA profiling and CUDA seeding; any
+            other value profiles the CPU only.
+
+    Raises:
+        ValueError: if ``dataset`` is empty.
+    """
+    if len(dataset) == 0:
+        raise ValueError("Cannot build a reference batch from an empty dataset.")
+    # Always take the first samples; wrap around when the dataset is smaller
+    # than the requested batch so the batch size is still honored.
+    indices = [i % len(dataset) for i in range(batch_size)]
+    reference_batch = default_collate([dataset[i] for i in indices])
+    # Mirror the uint8 → float32/255 conversion the train loop applies after
+    # the dataloader (PR #3406). The dataset ships camera frames as uint8 for
+    # faster transport, but policies like SmolVLA/xVLA run bilinear
+    # interpolation on images which doesn't support Byte tensors.
+    camera_keys = tuple(getattr(getattr(dataset, "meta", None), "camera_keys", ()) or ())
+    if not camera_keys:
+        # No metadata available: fall back to every tensor under the
+        # ``observation.images.`` prefix.
+        camera_keys = tuple(
+            key
+            for key, value in reference_batch.items()
+            if key.startswith("observation.images.") and isinstance(value, torch.Tensor)
+        )
+    for cam_key in camera_keys:
+        if cam_key in reference_batch and reference_batch[cam_key].dtype == torch.uint8:
+            reference_batch[cam_key] = reference_batch[cam_key].to(dtype=torch.float32) / 255.0
+    reference_batch = preprocessor(reference_batch)
+
+    activities = [torch.profiler.ProfilerActivity.CPU]
+    if device_type == "cuda":
+        activities.append(torch.profiler.ProfilerActivity.CUDA)
+
+    # Seed inside a forked RNG scope so the fixed seed used for the
+    # fingerprint is confined to this block. ``devices=[]`` skips the CUDA
+    # generators when not running on CUDA.
+    with torch.random.fork_rng(devices=[] if device_type != "cuda" else None):
+        torch.manual_seed(0)
+        if device_type == "cuda":
+            torch.cuda.manual_seed_all(0)
+        with torch.no_grad(), torch.profiler.profile(activities=activities) as prof:
+            loss, output_dict = policy.forward(reference_batch)
+
+    # One record per aggregated operator, sorted by name so the JSON is stable.
+    operators = sorted(
+        (
+            {
+                "key": e.key,
+                "count": e.count,
+                "cpu_time_total_us": _stable_float(getattr(e, "cpu_time_total", None)),
+                **(
+                    {"self_cuda_time_total_us": _get_profiler_device_time_us(e)}
+                    if device_type == "cuda"
+                    else {}
+                ),
+            }
+            for e in prof.key_averages()
+        ),
+        key=lambda e: e["key"],
+    )
+    outputs = {"loss": _summarize_value(loss), "output_dict": _summarize_value(output_dict)}
+    payload = {
+        "seed": 0,
+        "reference_batch_size": batch_size,
+        # Only (operator name, call count) pairs are hashed; timings are
+        # reported below but kept out of the fingerprint.
+        "operator_fingerprint": _hash_payload([(o["key"], o["count"]) for o in operators]),
+        "output_fingerprint": _hash_payload(outputs),
+        "operators": operators,
+        "outputs": outputs,
+    }
+    output_dir.mkdir(parents=True, exist_ok=True)
+    (output_dir / "deterministic_forward.json").write_text(json.dumps(payload, indent=2, sort_keys=True))
+    sort_by = "self_cuda_time_total" if device_type == "cuda" else "cpu_time_total"
+    _write_profiler_table(prof, output_dir / "deterministic_forward_ops.txt", sort_by=sort_by)
+
+
+class TrainingProfiler:
+    """Self-contained profiling hooks for the training loop.
+
+    The training script interacts via ``start()``, ``section()``, ``step()``,
+    ``finalize()``, and (optionally) ``record_deterministic_forward()`` — a
+    ~7-line surface.
+    """
+
+    # Arguments for ``torch.profiler.schedule``: steps to skip, steps to warm
+    # up, and steps to record (a single cycle, see ``repeat=1`` below).
+    _SCHEDULE_WAIT = 1
+    _SCHEDULE_WARMUP = 2
+    _SCHEDULE_ACTIVE = 6
+
+    def __init__(self, mode: str, output_dir: Path, device: torch.device) -> None:
+        """Create the output directory and build (but do not start) the torch profiler.
+
+        Args:
+            mode: Profiling mode; ``"trace"`` additionally exports Chrome traces.
+            output_dir: Directory that receives every artifact.
+            device: Training device; its ``type`` decides whether CUDA is profiled.
+        """
+        self._mode = mode
+        self._output_dir = output_dir
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        self._device = device
+        # Inline timing state — no separate collector class.
+        self._total_update_s: list[float] = []
+        self._dataloading_s: list[float] = []
+        self._section_s: dict[str, list[float]] = {}
+        self._memory: list[dict[str, int]] = []
+        self._torch = self._build_torch_profiler()
+        logger.info("Profiling enabled. Artifacts will be written to %s", output_dir)
+
+    def _build_torch_profiler(self) -> Any:
+        """Construct the scheduled ``torch.profiler.profile`` instance.
+
+        Also creates ``<output_dir>/torch_traces`` regardless of mode.
+        """
+        activities = [torch.profiler.ProfilerActivity.CPU]
+        if self._device.type == "cuda":
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        trace_dir = self._output_dir / "torch_traces"
+        trace_dir.mkdir(parents=True, exist_ok=True)
+
+        def _on_trace_ready(p: Any) -> None:
+            # Chrome traces are only exported in "trace" mode; other modes
+            # still run the profiler but skip the export.
+            if self._mode == "trace":
+                p.export_chrome_trace(str(trace_dir / f"trace_step_{p.step_num}.json"))
+
+        return torch.profiler.profile(
+            activities=activities,
+            schedule=torch.profiler.schedule(
+                wait=self._SCHEDULE_WAIT,
+                warmup=self._SCHEDULE_WARMUP,
+                active=self._SCHEDULE_ACTIVE,
+                repeat=1,
+            ),
+            on_trace_ready=_on_trace_ready,
+            record_shapes=True,
+            profile_memory=True,
+            with_flops=True,
+        )
+
+    @classmethod
+    def from_cfg(cls, cfg: Any, device: torch.device) -> TrainingProfiler:
+        """Build a profiler from a config exposing ``profile_mode``,
+        ``profile_output_dir`` and ``output_dir``.
+
+        Falls back to ``<cfg.output_dir>/profiling`` when no explicit
+        profiling directory is configured.
+        """
+        output = cfg.profile_output_dir or (Path(cfg.output_dir) / "profiling")
+        return cls(mode=cfg.profile_mode, output_dir=Path(output), device=device)
+
+    def record_deterministic_forward(
+        self, *, policy: Any, dataset: Any, batch_size: int, preprocessor: Any
+    ) -> None:
+        """Write the deterministic-forward artifacts into the output directory.
+
+        On CUDA the allocator cache is emptied afterwards.
+        """
+        logger.info("Recording deterministic forward-pass artifacts")
+        write_deterministic_forward_artifacts(
+            policy=policy,
+            dataset=dataset,
+            batch_size=batch_size,
+            preprocessor=preprocessor,
+            output_dir=self._output_dir,
+            device_type=self._device.type,
+        )
+        if self._device.type == "cuda":
+            torch.cuda.empty_cache()
+
+    def start(self) -> None:
+        """Reset CUDA peak-memory counters (if on CUDA) and enter the torch profiler."""
+        if self._device.type == "cuda":
+            torch.cuda.reset_peak_memory_stats(self._device)
+        # Entered manually; the matching ``__exit__`` is issued by ``finalize()``.
+        self._torch.__enter__()
+
+    @contextmanager
+    def section(self, name: str) -> Iterator[None]:
+        """Time a region of the training step. Syncs on CUDA so the
+        duration reflects GPU work, not just kernel-launch latency."""
+        if self._device.type == "cuda":
+            torch.cuda.synchronize(self._device)
+        t0 = time.perf_counter()
+        try:
+            yield
+        finally:
+            # Recorded even when the wrapped region raises.
+            if self._device.type == "cuda":
+                torch.cuda.synchronize(self._device)
+            self._section_s.setdefault(name, []).append(time.perf_counter() - t0)
+
+    def step(self, step_num: int, train_tracker: Any) -> None:
+        """Record one training step and advance the torch profiler schedule.
+
+        Reads ``update_s`` and ``dataloading_s`` from ``train_tracker`` and, on
+        CUDA, snapshots allocated/reserved memory tagged with ``step_num``.
+        """
+        self._total_update_s.append(_as_float(train_tracker.update_s))
+        self._dataloading_s.append(_as_float(train_tracker.dataloading_s))
+        if self._device.type == "cuda":
+            self._memory.append(
+                {
+                    "step": step_num,
+                    "allocated_bytes": torch.cuda.memory_allocated(self._device),
+                    "reserved_bytes": torch.cuda.memory_reserved(self._device),
+                }
+            )
+        self._torch.step()
+
+    def finalize(self) -> None:
+        """Exit the torch profiler and write the summary JSON and op tables.
+
+        Produces ``step_timing_summary.json`` plus one text table per sort key
+        under ``torch_tables/``.
+        """
+        self._torch.__exit__(None, None, None)
+        payload: dict[str, Any] = {
+            "profile_mode": self._mode,
+            "total_update_s": _summary(self._total_update_s),
+            "dataloading_s": _summary(self._dataloading_s),
+            "memory_timeline": self._memory,
+        }
+        # Each timed section appears as "<name>_s" next to the built-in keys.
+        for name, values in self._section_s.items():
+            payload[f"{name}_s"] = _summary(values)
+        if self._device.type == "cuda":
+            payload["peak_memory_allocated_bytes"] = torch.cuda.max_memory_allocated(self._device)
+            payload["peak_memory_reserved_bytes"] = torch.cuda.max_memory_reserved(self._device)
+        (self._output_dir / "step_timing_summary.json").write_text(
+            json.dumps(payload, indent=2, sort_keys=True)
+        )
+
+        tables_dir = self._output_dir / "torch_tables"
+        tables_dir.mkdir(parents=True, exist_ok=True)
+        # Table writes are best-effort: a failing sort key is logged by the
+        # helper and does not abort the remaining tables.
+        _write_profiler_table(self._torch, tables_dir / "cpu_time_total.txt", sort_by="cpu_time_total")
+        _write_profiler_table(self._torch, tables_dir / "cpu_memory.txt", sort_by="self_cpu_memory_usage")
+        _write_profiler_table(self._torch, tables_dir / "flops.txt", sort_by="flops")
+        if self._device.type == "cuda":
+            _write_profiler_table(
+                self._torch, tables_dir / "cuda_time_total.txt", sort_by="self_cuda_time_total"
+            )
+            _write_profiler_table(
+                self._torch, tables_dir / "cuda_memory.txt", sort_by="self_cuda_memory_usage"
+            )
+
+
+# ---------------------------------------------------------------------------
+# CI orchestrator. Spawns `lerobot-train` per policy, collects the
+# artifacts, (optionally) uploads to the HF Hub results dataset.
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class UploadTarget:
+    """One local file paired with its destination path inside the Hub repo."""
+
+    # File on disk to upload.
+    local_path: Path
+    # Destination path within the dataset repository.
+    path_in_repo: str
+
+
+@dataclass(frozen=True)
+class UploadResult:
+    """Outcome of one upload commit."""
+
+    # Maps each uploaded ``path_in_repo`` to its download URL.
+    uploaded_paths: dict[str, str]
+    # URL of the pull request reported by the commit, if any.
+    pr_url: str | None = None
+
+
+def _utc_timestamp_slug(now: datetime | None = None) -> str:
+    """Format ``now`` (default: the current UTC time) as ``YYYYMMDDTHHMMSSZ``."""
+    moment = datetime.now(UTC) if now is None else now
+    return moment.strftime("%Y%m%dT%H%M%SZ")
+
+
+def _hub_file_url(repo_id: str, path_in_repo: str, *, revision: str = "main") -> str:
+    """Build the ``resolve`` download URL for a file in a Hub dataset repo.
+
+    Args:
+        repo_id: Dataset repository id, e.g. ``"org/name"``.
+        path_in_repo: File path inside the repository.
+        revision: Branch, tag or ref; PR refs such as ``refs/pr/7`` are allowed.
+
+    Returns:
+        The URL with the revision fully percent-encoded and the file path
+        percent-encoded except for its ``/`` separators.
+    """
+    from urllib.parse import quote
+
+    # A revision like "refs/pr/7" contains slashes; encode them so the whole
+    # ref stays a single URL path segment (same convention as
+    # ``huggingface_hub.hf_hub_url``).
+    safe_revision = quote(revision, safe="")
+    safe_path = quote(path_in_repo)
+    return f"https://huggingface.co/datasets/{repo_id}/resolve/{safe_revision}/{safe_path}"
+
+
+def parse_discussion_num(pr_url: str | None) -> int | None:
+    """Extract the trailing ``/discussions/<number>`` id from a Hub PR URL.
+
+    Returns ``None`` for a missing/empty URL or when the URL does not end in
+    that pattern.
+    """
+    if pr_url is None or pr_url == "":
+        return None
+    found = re.search(r"/discussions/(\d+)$", pr_url)
+    if found is None:
+        return None
+    return int(found.group(1))
+
+
+def upload_targets(
+    repo_id: str,
+    targets: list[UploadTarget],
+    *,
+    token: str | None = None,
+    commit_message: str | None = None,
+    create_pr: bool = False,
+) -> UploadResult:
+    """Upload ``targets`` to a Hub dataset repo in a single commit.
+
+    Args:
+        repo_id: Dataset repository id.
+        targets: Local files and their destination paths.
+        token: Optional Hub token forwarded to ``HfApi``.
+        commit_message: Commit title; defaults to a count-based message.
+        create_pr: Open a pull request against ``main`` instead of committing directly.
+
+    Returns:
+        The per-file URLs (pointing at the PR ref when a PR was created and
+        its number could be parsed, otherwise at ``main``) and the PR URL.
+    """
+    api = HfApi(token=token)
+    operations = [
+        CommitOperationAdd(path_in_repo=target.path_in_repo, path_or_fileobj=str(target.local_path))
+        for target in targets
+    ]
+    message = commit_message or f"Upload {len(targets)} profiling artifacts"
+    commit = api.create_commit(
+        repo_id=repo_id,
+        repo_type="dataset",
+        operations=operations,
+        commit_message=message,
+        revision="main",
+        create_pr=create_pr,
+    )
+
+    pr_num = parse_discussion_num(commit.pr_url)
+    revision = "main"
+    if create_pr and pr_num:
+        revision = f"refs/pr/{pr_num}"
+
+    links: dict[str, str] = {}
+    for target in targets:
+        links[target.path_in_repo] = _hub_file_url(repo_id, target.path_in_repo, revision=revision)
+    return UploadResult(uploaded_paths=links, pr_url=commit.pr_url)
+
+
+def build_train_command(policy: str, run_dir: Path, profile_mode: str) -> list[str]:
+    """Assemble the ``uv run lerobot-train`` command line for one policy.
+
+    The policy-specific flags from ``POLICY_SPECS`` come first, followed by the
+    shared flags: output under ``run_dir / "train"``, no eval, no checkpoints,
+    no wandb/hub push, and profiling written to ``run_dir / "profiling"``.
+    """
+    spec = POLICY_SPECS[policy]
+    step_count = spec["steps"]
+    train_dir = run_dir / "train"
+    profiling_dir = run_dir / "profiling"
+
+    command = ["uv", "run", "lerobot-train"]
+    command.extend(spec["train_args"])
+    command.extend(
+        [
+            f"--output_dir={train_dir}",
+            f"--steps={step_count}",
+            "--eval_freq=0",
+            "--save_checkpoint=false",
+            f"--save_freq={step_count}",
+            "--wandb.enable=false",
+            "--policy.push_to_hub=false",
+            "--num_workers=0",
+            "--log_freq=1",
+            f"--profile_mode={profile_mode}",
+            f"--profile_output_dir={profiling_dir}",
+        ]
+    )
+    return command
+
+
+def build_artifact_index(
+    *, repo_id: str, run_dir: Path, policy_name: str, run_id: str
+) -> tuple[dict[str, Any], dict[str, Any], list[UploadTarget], str]:
+    """Scan the run directory and categorize files into
+    (stdout/stderr, torch_tables/*, torch_traces/*, everything else under profiling/).
+
+    Args:
+        repo_id: Hub dataset repository the artifacts will be uploaded to.
+        run_dir: Local directory of one profiling run.
+        policy_name: Policy the run belongs to; part of every repo path.
+        run_id: Unique run identifier; part of every repo path.
+
+    Returns:
+        ``(paths, urls, upload targets, row path in repo)`` where ``paths`` and
+        ``urls`` share the same keys and hold repo paths and ``main``-branch
+        URLs respectively.
+    """
+    row_path_in_repo = f"rows/{policy_name}/{run_id}.json"
+    root = f"artifacts/{policy_name}/{run_id}"
+    paths: dict[str, Any] = {
+        "row": row_path_in_repo,
+        "profiling_files": {},
+        "torch_tables": {},
+        "trace_files": {},
+    }
+    urls: dict[str, Any] = {
+        "row": _hub_file_url(repo_id, row_path_in_repo),
+        "profiling_files": {},
+        "torch_tables": {},
+        "trace_files": {},
+    }
+    targets: list[UploadTarget] = []
+
+    for name in ("stdout.txt", "stderr.txt"):
+        p = run_dir / name
+        if p.exists():
+            key = name.removesuffix(".txt")
+            repo = f"{root}/{name}"
+            paths[key] = repo
+            urls[key] = _hub_file_url(repo_id, repo)
+            targets.append(UploadTarget(p, repo))
+
+    profiling_dir = run_dir / "profiling"
+    if profiling_dir.exists():
+        for p in sorted(profiling_dir.rglob("*")):
+            if not p.is_file():
+                continue
+            rel_path = p.relative_to(run_dir)
+            # Repo paths always use "/" separators, whatever the host OS uses.
+            rel = rel_path.as_posix()
+            repo = f"{root}/{rel}"
+            url = _hub_file_url(repo_id, repo)
+            paths["profiling_files"][rel] = repo
+            urls["profiling_files"][rel] = url
+            targets.append(UploadTarget(p, repo))
+            # Classify on the run-relative parts only, so a directory named
+            # "torch_tables"/"torch_traces" above ``run_dir`` cannot match.
+            if p.name == "step_timing_summary.json":
+                paths["step_timing_summary"] = repo
+                urls["step_timing_summary"] = url
+            elif "torch_tables" in rel_path.parts:
+                paths["torch_tables"][p.name] = repo
+                urls["torch_tables"][p.name] = url
+            elif "torch_traces" in rel_path.parts:
+                paths["trace_files"][p.name] = repo
+                urls["trace_files"][p.name] = url
+
+    return paths, urls, targets, row_path_in_repo
+
+
+def upload_profile_run(
+    *,
+    repo_id: str,
+    row_path: Path,
+    row_path_in_repo: str,
+    artifact_targets: list[UploadTarget],
+    create_pr: bool = False,
+) -> UploadResult:
+    """Upload a run's artifacts together with its row file in one commit.
+
+    The row file is appended after the artifacts; the commit message names the
+    row's path in the repo.
+    """
+    bundle = list(artifact_targets)
+    bundle.append(UploadTarget(row_path, row_path_in_repo))
+    message = f"Add model profiling row {row_path_in_repo}"
+    return upload_targets(
+        repo_id=repo_id,
+        targets=bundle,
+        commit_message=message,
+        create_pr=create_pr,
+    )
+
+
+def _load_json(path: Path) -> dict[str, Any]:
+    """Load a JSON artifact, treating a missing or unparsable file as empty.
+
+    A training subprocess that dies mid-write can leave a truncated file
+    behind; returning ``{}`` (with a warning) lets the orchestrator still
+    record the failed run and continue with the remaining policies.
+
+    Args:
+        path: Location of the JSON file.
+
+    Returns:
+        The decoded JSON content, or ``{}`` when the file does not exist or
+        cannot be decoded.
+    """
+    if not path.exists():
+        return {}
+    try:
+        return json.loads(path.read_text())
+    except ValueError:
+        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
+        logging.getLogger(__name__).warning("Ignoring unparsable JSON artifact %s", path, exc_info=True)
+        return {}
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the orchestrator's command-line flags from ``sys.argv``.
+
+    ``--output_dir`` is the only required flag; ``--policies`` defaults to
+    ``None`` and ``--profile_mode`` is limited to ``summary``/``trace``.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--policies", nargs="*", default=None)
+    cli.add_argument("--output_dir", type=Path, required=True)
+    for flag, default in (("--hub_org", "lerobot"), ("--results_repo", "model-profiling-history")):
+        cli.add_argument(flag, default=default)
+    cli.add_argument("--publish", action="store_true")
+    cli.add_argument("--profile_mode", choices=["summary", "trace"], default="trace")
+    # Free-form provenance flags; all default to the empty string.
+    for flag in ("--git_commit", "--git_ref", "--pr_number"):
+        cli.add_argument(flag, default="")
+    return cli.parse_args()
+
+
+def main() -> int:
+    """Profile each selected policy in a subprocess and record one row per run.
+
+    For every policy this runs the training command, stores stdout/stderr,
+    indexes the produced artifacts, writes ``profiling_row.json`` and, with
+    ``--publish``, uploads artifacts and row to the results dataset.
+
+    Returns:
+        ``0`` if every training subprocess exited with code 0, otherwise ``1``.
+        Publish failures are recorded in the row but do not change the result.
+
+    Raises:
+        ValueError: if an unknown policy name was requested.
+        RuntimeError: if ``git`` is not on ``PATH``.
+    """
+    args = parse_args()
+    selected = args.policies or list(POLICY_SPECS)
+    unknown = sorted(set(selected) - set(POLICY_SPECS))
+    if unknown:
+        raise ValueError(f"Unknown profiling policies: {', '.join(unknown)}")
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    # A results repo without an org prefix is placed under ``--hub_org``.
+    repo_id = args.results_repo if "/" in args.results_repo else f"{args.hub_org}/{args.results_repo}"
+    git_exe = shutil.which("git")
+    if not git_exe:
+        raise RuntimeError("git not found in PATH")
+    git_commit = args.git_commit or subprocess.check_output([git_exe, "rev-parse", "HEAD"], text=True).strip()
+    pr_number = int(args.pr_number) if str(args.pr_number).strip() else None
+    exit_code = 0
+
+    for policy in selected:
+        run_id = f"{_utc_timestamp_slug()}__{policy}"
+        run_dir = args.output_dir / policy / run_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+        cmd = build_train_command(policy, run_dir, args.profile_mode)
+
+        t0 = time.perf_counter()
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        wall_s = time.perf_counter() - t0
+
+        # Logs are written before indexing so they are picked up as artifacts.
+        (run_dir / "stdout.txt").write_text(result.stdout)
+        (run_dir / "stderr.txt").write_text(result.stderr)
+        if result.returncode != 0:
+            # Remember the failure but keep going with the remaining policies.
+            exit_code = 1
+
+        paths, urls, upload_list, row_in_repo = build_artifact_index(
+            repo_id=repo_id, run_dir=run_dir, policy_name=policy, run_id=run_id
+        )
+        row: dict[str, Any] = {
+            "schema_version": 1,
+            "created_at": datetime.now(UTC).isoformat(),
+            "run_id": run_id,
+            "policy": policy,
+            "git_commit": git_commit,
+            "git_ref": args.git_ref or None,
+            "pr_number": pr_number,
+            "status": "success" if result.returncode == 0 else "failed",
+            "return_code": result.returncode,
+            "profile_mode": args.profile_mode,
+            "wall_time_s": wall_s,
+            "spec": {
+                "steps": POLICY_SPECS[policy]["steps"],
+                "train_args": POLICY_SPECS[policy]["train_args"],
+            },
+            "step_timing_summary": _load_json(run_dir / "profiling" / "step_timing_summary.json"),
+            "deterministic_forward": _load_json(run_dir / "profiling" / "deterministic_forward.json"),
+            "artifact_paths": paths,
+            "artifact_urls": urls,
+            "stderr_tail": result.stderr.splitlines()[-20:],
+        }
+
+        row_path = run_dir / "profiling_row.json"
+        row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+
+        if args.publish:
+            # The row file uploaded here is the version written just above; the
+            # publish_* fields are only added to the local copy afterwards.
+            try:
+                uploaded = upload_profile_run(
+                    repo_id=repo_id,
+                    row_path=row_path,
+                    row_path_in_repo=row_in_repo,
+                    artifact_targets=upload_list,
+                    create_pr=pr_number is not None,
+                )
+            except HfHubHTTPError as exc:
+                # Only Hub HTTP errors are recorded; other exceptions propagate.
+                row.update({"publish_status": "failed", "publish_error": str(exc)})
+            else:
+                row.update(
+                    {
+                        "publish_status": "success",
+                        "uploaded_paths": uploaded.uploaded_paths,
+                        "publish_pr_url": uploaded.pr_url,
+                        "publish_pr_number": parse_discussion_num(uploaded.pr_url),
+                    }
+                )
+            row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+
+        print(json.dumps(row, indent=2, sort_keys=True))
+
+    return exit_code
+
+
+# Script entry point: the process exit status is the value returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper.
+
+These tests mock out the SAPIEN-based RoboTwin runtime (task modules +
+YAML config loader) so they run without the full RoboTwin installation
+(SAPIEN, CuRobo, mplib, asset downloads, etc.).
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from unittest.mock import MagicMock, patch
+
+import gymnasium as gym
+import numpy as np
+import pytest
+
+from lerobot.envs.robotwin import (
+    ACTION_DIM,
+    ROBOTWIN_CAMERA_NAMES,
+    ROBOTWIN_TASKS,
+    RoboTwinEnv,
+    create_robotwin_envs,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures / helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_mock_task_env(
+    height: int = 240,
+    width: int = 320,
+    cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES,
+) -> MagicMock:
+    """Build a ``MagicMock`` that stands in for a RoboTwin task instance.
+
+    RoboTwin's real get_obs returns
+        {"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...}
+    and the mock's ``get_obs`` return value mirrors that nesting with all-zero data.
+
+    Args:
+        height: Number of rows in every fake RGB frame.
+        width: Number of columns in every fake RGB frame.
+        cameras: Camera names placed under the ``"observation"`` key.
+
+    Returns:
+        A mock with ``get_obs``, ``setup_demo``, ``take_action``, ``check_success`` and
+        ``close_env`` return values preset, and ``eval_success`` set to ``False``.
+    """
+    camera_frames = {}
+    for cam in cameras:
+        camera_frames[cam] = {"rgb": np.zeros((height, width, 3), dtype=np.uint8)}
+    joint_vector = np.zeros(ACTION_DIM, dtype=np.float32)
+
+    task_env = MagicMock()
+    task_env.get_obs.return_value = {
+        "observation": camera_frames,
+        "joint_action": {"vector": joint_vector},
+        "endpose": {},
+    }
+    # Success signals are off by default; individual tests flip them when needed.
+    task_env.eval_success = False
+    task_env.check_success.return_value = False
+    for method_name in ("setup_demo", "take_action", "close_env"):
+        getattr(task_env, method_name).return_value = None
+    return task_env
+
+
+@contextmanager
+def _patch_runtime(mock_task_instance: MagicMock):
+    """Replace the RoboTwin task-class loader and the YAML setup loader with fakes.
+
+    While the context is active, ``_load_robotwin_task`` returns a mock class whose
+    instantiation yields ``mock_task_instance`` and ``_load_robotwin_setup_kwargs``
+    returns a small hard-coded setup dict, so the env can construct + reset
+    without a real RoboTwin install.
+    """
+    setup_kwargs = {
+        "head_camera_h": 240,
+        "head_camera_w": 320,
+        "left_embodiment_config": {},
+        "right_embodiment_config": {},
+        "left_robot_file": "",
+        "right_robot_file": "",
+        "dual_arm_embodied": True,
+        "render_freq": 0,
+        "task_name": "beat_block_hammer",
+        "task_config": "demo_clean",
+    }
+    fake_task_cls = MagicMock(return_value=mock_task_instance)
+    # Patchers are built up front; nothing is replaced until the ``with`` enters them.
+    task_loader_patch = patch("lerobot.envs.robotwin._load_robotwin_task", return_value=fake_task_cls)
+    setup_loader_patch = patch(
+        "lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=setup_kwargs
+    )
+    with task_loader_patch, setup_loader_patch:
+        yield
+
+
+# ---------------------------------------------------------------------------
+# RoboTwinEnv unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestRoboTwinEnv:
+    """Behavioural tests for ``RoboTwinEnv`` run against a mocked RoboTwin runtime."""
+
+    def test_observation_space_shape(self):
+        """observation_space should have the configured h×w×3 for every camera."""
+        height, width = 240, 320
+        env = RoboTwinEnv(
+            task_name="beat_block_hammer",
+            observation_height=height,
+            observation_width=width,
+            camera_names=["head_camera", "left_camera"],
+        )
+        pixels_space = env.observation_space["pixels"]
+        for requested_camera in ("head_camera", "left_camera"):
+            assert pixels_space[requested_camera].shape == (height, width, 3)
+        # A camera that was not requested must not appear in the space.
+        assert "right_camera" not in pixels_space
+
+    def test_action_space(self):
+        """The action space is a float32 vector of length ACTION_DIM."""
+        action_space = RoboTwinEnv(task_name="beat_block_hammer").action_space
+        assert action_space.shape == (ACTION_DIM,)
+        assert action_space.dtype == np.float32
+
+    def test_reset_returns_correct_obs_keys(self):
+        """reset() yields pixels for every default camera, agent_pos, and a False success flag."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task):
+            obs, info = env.reset()
+
+        assert "pixels" in obs
+        for cam in ROBOTWIN_CAMERA_NAMES:
+            assert cam in obs["pixels"], f"Missing camera '{cam}' in obs"
+        assert "agent_pos" in obs
+        assert obs["agent_pos"].shape == (ACTION_DIM,)
+        assert info["is_success"] is False
+
+    def test_reset_calls_setup_demo(self):
+        """reset(seed=...) calls the task's setup_demo once with that seed and is_test=True."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task):
+            env.reset(seed=42)
+        # setup_demo receives the full YAML-derived kwargs plus seed + is_test;
+        # only the caller-provided bits are asserted here.
+        setup_demo = task.setup_demo
+        assert setup_demo.call_count == 1
+        passed_kwargs = setup_demo.call_args.kwargs
+        assert passed_kwargs["seed"] == 42
+        assert passed_kwargs["is_test"] is True
+
+    def test_step_returns_correct_types(self):
+        """step() returns the (obs, reward, terminated, truncated, info) 5-tuple with plain types."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task):
+            env.reset()
+            obs, reward, terminated, truncated, info = env.step(zero_action)
+
+        expectations = (
+            (obs, dict),
+            (reward, float),
+            (terminated, bool),
+            (truncated, bool),
+            (info, dict),
+        )
+        for value, expected_type in expectations:
+            assert isinstance(value, expected_type)
+
+    def test_step_wrong_action_shape_raises(self):
+        """An action whose length differs from ACTION_DIM is rejected with ValueError."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        wrong_dim_action = np.zeros(7, dtype=np.float32)  # wrong dim
+        with _patch_runtime(task):
+            env.reset()
+            with pytest.raises(ValueError, match="Expected 1-D action"):
+                env.step(wrong_dim_action)
+
+    def test_success_terminates_episode(self):
+        """When the task's check_success is True, step() terminates and flags success."""
+        task = _make_mock_task_env()
+        task.check_success.return_value = True
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task):
+            env.reset()
+            _, _, terminated, _, info = env.step(zero_action)
+        assert terminated is True
+        assert info["is_success"] is True
+
+    def test_truncation_after_episode_length(self):
+        """With episode_length=2, the second step reports truncated=True."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2)
+        zero_action = np.zeros(ACTION_DIM, dtype=np.float32)
+        with _patch_runtime(task):
+            env.reset()
+            env.step(zero_action)  # step 1
+            _, _, _, truncated, _ = env.step(zero_action)  # step 2 → truncated
+        assert truncated is True
+
+    def test_close_calls_close_env(self):
+        """close() forwards to the task's close_env exactly once."""
+        task = _make_mock_task_env()
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        with _patch_runtime(task):
+            env.reset()
+            env.close()
+        assert task.close_env.call_count == 1
+
+    def test_black_frame_for_missing_camera(self):
+        """If a camera key is absent from get_obs(), a black frame is returned."""
+        side = 10
+        # Mock exposes only head_camera; the env is asked for head_camera + left_camera.
+        task = _make_mock_task_env(height=side, width=side, cameras=("head_camera",))
+        env = RoboTwinEnv(
+            task_name="beat_block_hammer",
+            camera_names=["head_camera", "left_camera"],
+            observation_height=side,
+            observation_width=side,
+        )
+        with _patch_runtime(task):
+            obs, _ = env.reset()
+        left_frame = obs["pixels"]["left_camera"]
+        assert left_frame.shape == (side, side, 3)
+        assert left_frame.sum() == 0
+
+    def test_task_and_task_description_attributes(self):
+        """The env exposes the task name as ``task`` and a string ``task_description``."""
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        assert env.task == "beat_block_hammer"
+        assert isinstance(env.task_description, str)
+
+    def test_deferred_init_env_is_none_before_reset(self):
+        """Constructing the env leaves the wrapped task unset until reset() is called."""
+        env = RoboTwinEnv(task_name="beat_block_hammer")
+        assert env._env is None  # noqa: SLF001  (testing internal state)
+
+
+# ---------------------------------------------------------------------------
+# create_robotwin_envs tests
+# ---------------------------------------------------------------------------
+
+
+class TestCreateRoboTwinEnvs:
+    """Tests for the ``create_robotwin_envs`` factory."""
+
+    def test_returns_correct_structure(self):
+        """A single task yields {task_name: {0: SyncVectorEnv}}."""
+        task = _make_mock_task_env()
+        with _patch_runtime(task):
+            envs = create_robotwin_envs(
+                task="beat_block_hammer",
+                n_envs=1,
+                env_cls=gym.vector.SyncVectorEnv,
+            )
+        assert "beat_block_hammer" in envs
+        per_task = envs["beat_block_hammer"]
+        assert 0 in per_task
+        assert isinstance(per_task[0], gym.vector.SyncVectorEnv)
+
+    def test_multi_task(self):
+        """A comma-separated task string yields one top-level key per task."""
+        task = _make_mock_task_env()
+        with _patch_runtime(task):
+            envs = create_robotwin_envs(
+                task="beat_block_hammer,click_bell",
+                n_envs=1,
+                env_cls=gym.vector.SyncVectorEnv,
+            )
+        expected_tasks = {"beat_block_hammer", "click_bell"}
+        assert set(envs.keys()) == expected_tasks
+
+    def test_unknown_task_raises(self):
+        """A task name outside the known list is rejected with ValueError."""
+        factory_kwargs = {
+            "task": "not_a_real_task",
+            "n_envs": 1,
+            "env_cls": gym.vector.SyncVectorEnv,
+        }
+        with pytest.raises(ValueError, match="Unknown RoboTwin tasks"):
+            create_robotwin_envs(**factory_kwargs)
+
+    def test_invalid_n_envs_raises(self):
+        """n_envs=0 is rejected with ValueError."""
+        factory_kwargs = {
+            "task": "beat_block_hammer",
+            "n_envs": 0,
+            "env_cls": gym.vector.SyncVectorEnv,
+        }
+        with pytest.raises(ValueError, match="n_envs must be a positive int"):
+            create_robotwin_envs(**factory_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# ROBOTWIN_TASKS list
+# ---------------------------------------------------------------------------
+
+
+def test_task_list_not_empty():
+    """ROBOTWIN_TASKS lists at least 50 task names."""
+    task_count = len(ROBOTWIN_TASKS)
+    assert task_count >= 50
+
+
+def test_all_tasks_are_strings():
+    """Every entry of ROBOTWIN_TASKS is a non-empty string."""
+    for task_name in ROBOTWIN_TASKS:
+        assert isinstance(task_name, str) and task_name
+
+
+def test_no_duplicate_tasks():
+    """No task name appears twice in ROBOTWIN_TASKS."""
+    unique_tasks = set(ROBOTWIN_TASKS)
+    assert len(unique_tasks) == len(ROBOTWIN_TASKS)
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
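+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.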
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+from pathlib import Path
+
+import pytest
+import torch
+from huggingface_hub.errors import HfHubHTTPError
+
+from lerobot.utils import model_profiling as mp
+
+# ---------------------------------------------------------------------------
+# Policy spec matrix
+# ---------------------------------------------------------------------------
+
+
+def test_policy_specs_cover_expected_policies():
+    """POLICY_SPECS holds exactly the expected policy names and none of the excluded ones."""
+    expected_policies = {
+        "act",
+        "diffusion",
+        "groot",
+        "multi_task_dit",
+        "pi0",
+        "pi0_fast",
+        "pi05",
+        "smolvla",
+        "wall_x",
+        "xvla",
+    }
+    assert set(mp.POLICY_SPECS) == expected_policies
+
+    # Sanity: excluded policies should stay out of the matrix.
+    excluded_policies = ("sac", "sarm", "tdmpc", "vqbet", "reward_classifier")
+    for policy_name in excluded_policies:
+        assert policy_name not in mp.POLICY_SPECS
+
+
+def test_pretrained_libero_specs_match_expected_camera_keys_and_normalization():
+    """The pi0-family and smolvla specs carry the expected rename maps; pi05 also sets ACTION norm."""
+    base_rgb_rename = (
+        '--rename_map={"observation.images.front": "observation.images.base_0_rgb", '
+        '"observation.images.wrist": "observation.images.left_wrist_0_rgb"}'
+    )
+    smolvla_rename = (
+        '--rename_map={"observation.images.front": "observation.images.camera1", '
+        '"observation.images.wrist": "observation.images.camera2"}'
+    )
+    normalization_prefix = '--policy.normalization_mapping={"ACTION": "MEAN_STD"'
+
+    for policy_name in ("pi0", "pi0_fast", "pi05"):
+        assert base_rgb_rename in mp.POLICY_SPECS[policy_name]["train_args"]
+
+    pi05_args = mp.POLICY_SPECS["pi05"]["train_args"]
+    assert any(arg.startswith(normalization_prefix) for arg in pi05_args)
+
+    assert smolvla_rename in mp.POLICY_SPECS["smolvla"]["train_args"]
+
+
+# ---------------------------------------------------------------------------
+# CI orchestrator helpers
+# ---------------------------------------------------------------------------
+
+
+def test_build_train_command_includes_profiling_outputs(tmp_path):
+    """The built command starts with ``uv run lerobot-train`` and carries the profiling flags."""
+    cmd = mp.build_train_command("act", tmp_path / "run", "trace")
+    assert cmd[:3] == ["uv", "run", "lerobot-train"]
+    # Path-valued flags: only the prefix is checked, not the path itself.
+    for flag_prefix in ("--output_dir=", "--profile_output_dir="):
+        assert any(arg.startswith(flag_prefix) for arg in cmd)
+    for exact_flag in ("--profile_mode=trace", "--eval_freq=0"):
+        assert exact_flag in cmd
+
+
+def test_build_artifact_index_collects_tables_and_traces(tmp_path):
+    """build_artifact_index picks up stdout/stderr plus the profiling files laid out on disk."""
+    run_dir = tmp_path / "act" / "20260415T000000Z__act"
+    profiling = run_dir / "profiling"
+    tables_dir = profiling / "torch_tables"
+    traces_dir = profiling / "torch_traces"
+    for directory in (tables_dir, traces_dir):
+        directory.mkdir(parents=True)
+
+    fingerprints = {"operator_fingerprint": "ops", "output_fingerprint": "out"}
+    file_contents = {
+        profiling / "step_timing_summary.json": "{}",
+        profiling / "deterministic_forward.json": json.dumps(fingerprints),
+        tables_dir / "cpu_time_total.txt": "cpu table",
+        traces_dir / "trace_step_9.json": "{}",
+        run_dir / "stdout.txt": "stdout",
+        run_dir / "stderr.txt": "stderr",
+    }
+    for file_path, content in file_contents.items():
+        file_path.write_text(content)
+
+    paths, urls, targets, row_in_repo = mp.build_artifact_index(
+        repo_id="lerobot/model-profiling-history",
+        run_dir=run_dir,
+        policy_name="act",
+        run_id="20260415T000000Z__act",
+    )
+
+    assert row_in_repo == "rows/act/20260415T000000Z__act.json"
+    assert paths["stdout"].endswith("/stdout.txt")
+    assert paths["step_timing_summary"].endswith("/profiling/step_timing_summary.json")
+    assert "cpu_time_total.txt" in paths["torch_tables"]
+    assert "trace_step_9.json" in paths["trace_files"]
+    assert urls["row"].startswith("https://huggingface.co/datasets/lerobot/model-profiling-history/")
+    # stdout + stderr + 4 profiling files
+    assert len(targets) == 6
+
+
+def test_upload_targets_batches_preview_publish_into_single_hf_pr(monkeypatch, tmp_path):
+    """upload_targets forwards repo/PR settings to create_commit and reports PR-ref URLs."""
+    row_file = tmp_path / "profiling_row.json"
+    row_file.write_text("{}")
+    # Collects the HfApi constructor token and every create_commit kwarg.
+    recorded: dict[str, object] = {}
+
+    class _StubCommit:
+        pr_url = "https://huggingface.co/datasets/lerobot/model-profiling-history/discussions/42"
+
+    class _StubApi:
+        def __init__(self, token=None):
+            recorded["token"] = token
+
+        def create_commit(self, **kwargs):
+            recorded.update(kwargs)
+            return _StubCommit()
+
+    monkeypatch.setattr(mp, "HfApi", _StubApi)
+
+    target = mp.UploadTarget(row_file, "rows/act/run.json")
+    result = mp.upload_targets(
+        repo_id="lerobot/model-profiling-history",
+        targets=[target],
+        create_pr=True,
+        token="hf_test_token",
+    )
+
+    assert recorded["repo_id"] == "lerobot/model-profiling-history"
+    assert recorded["repo_type"] == "dataset"
+    assert recorded["create_pr"] is True
+    assert result.pr_url == _StubCommit.pr_url
+    uploaded_url = result.uploaded_paths["rows/act/run.json"]
+    assert uploaded_url.endswith("/resolve/refs/pr/42/rows/act/run.json")
+
+
+def test_parse_discussion_num_handles_hf_discussion_urls():
+    """A ``/discussions/<n>`` URL parses to ``n``; a plain repo URL and ``None`` give ``None``."""
+    discussion_url = "https://huggingface.co/datasets/lerobot/model-profiling-history/discussions/42"
+    repo_url = "https://huggingface.co/datasets/lerobot/model-profiling-history"
+
+    assert mp.parse_discussion_num(discussion_url) == 42
+    for input_without_number in (repo_url, None):
+        assert mp.parse_discussion_num(input_without_number) is None
+
+
+# ---------------------------------------------------------------------------
+# main() smoke tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def fake_args(tmp_path):
+    """Shared argparse namespace for main() smoke tests — overridden per-test."""
+    namespace_fields = {
+        "policies": ["act"],
+        "output_dir": tmp_path / "results",
+        "hub_org": "lerobot",
+        "results_repo": "model-profiling-history",
+        "publish": False,
+        "profile_mode": "summary",
+        "git_commit": "",
+        "git_ref": "codex/model-profiling",
+        "pr_number": "3389",
+    }
+    return argparse.Namespace(**namespace_fields)
+
+
+def _stub_train_subprocess(mp_module, *, returncode: int = 0, write_artifacts: bool = True):
+    """Build a fake subprocess.run that writes the profiling artifacts main() expects.
+
+    Args:
+        mp_module: Accepted for call-site symmetry; not used by the stub.
+        returncode: Return code reported by the fake ``CompletedProcess``.
+        write_artifacts: When ``False`` only the profile directory is created, without
+            the JSON summaries or the torch table.
+
+    Returns:
+        A ``_fake_run(cmd, capture_output, text)`` callable.
+    """
+    profile_dir_flag = "--profile_output_dir="
+    timing_summary = {"total_update_s": {"count": 1, "mean": 0.3}, "peak_memory_allocated_bytes": 1024}
+    forward_fingerprints = {
+        "operator_fingerprint": "ops-fingerprint",
+        "output_fingerprint": "output-fingerprint",
+    }
+
+    def _fake_run(cmd, capture_output, text):
+        assert capture_output is True
+        assert text is True
+        # The profile directory is recovered from the command line itself.
+        flag_argument = next(arg for arg in cmd if arg.startswith(profile_dir_flag))
+        profile_dir = Path(flag_argument.split("=", 1)[1])
+        profile_dir.mkdir(parents=True, exist_ok=True)
+        if write_artifacts:
+            tables_dir = profile_dir / "torch_tables"
+            tables_dir.mkdir(parents=True, exist_ok=True)
+            (profile_dir / "step_timing_summary.json").write_text(json.dumps(timing_summary))
+            (profile_dir / "deterministic_forward.json").write_text(json.dumps(forward_fingerprints))
+            (tables_dir / "cpu_time_total.txt").write_text("cpu time table")
+        return subprocess.CompletedProcess(cmd, returncode, "stdout ok", "")
+
+    return _fake_run
+
+
+def test_main_smoke_writes_row(monkeypatch, fake_args):
+    """A successful stubbed run makes main() return 0 and write one populated row file."""
+    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
+    monkeypatch.setattr(mp.subprocess, "check_output", lambda *a, **k: "deadbeef\n")
+    monkeypatch.setattr(mp.subprocess, "run", _stub_train_subprocess(mp))
+
+    assert mp.main() == 0
+
+    row_paths = list(fake_args.output_dir.rglob("profiling_row.json"))
+    assert len(row_paths) == 1
+    row = json.loads(row_paths[0].read_text())
+
+    expected_top_level = {
+        "policy": "act",
+        "status": "success",
+        "git_commit": "deadbeef",
+        "git_ref": "codex/model-profiling",
+        "pr_number": 3389,
+    }
+    for field_name, expected_value in expected_top_level.items():
+        assert row[field_name] == expected_value
+    assert row["step_timing_summary"]["total_update_s"]["mean"] == 0.3
+    assert row["deterministic_forward"]["operator_fingerprint"] == "ops-fingerprint"
+
+
+def test_main_records_publish_failure_without_failing(monkeypatch, fake_args):
+    """An HfHubHTTPError during publish is recorded in the row while main() still returns 0."""
+    fake_args.publish = True
+    fake_args.git_commit = "deadbeef"
+
+    def _fail_upload(**kwargs):
+        # Minimal response stand-in carrying only the attributes set here.
+        response_type = type("Resp", (), {"status_code": 403, "headers": {}, "request": None})
+        raise HfHubHTTPError("403 Forbidden: Authorization error.", response=response_type())
+
+    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
+    monkeypatch.setattr(mp.subprocess, "run", _stub_train_subprocess(mp, write_artifacts=False))
+    monkeypatch.setattr(mp, "upload_profile_run", _fail_upload)
+
+    assert mp.main() == 0
+
+    row_path = next(fake_args.output_dir.rglob("profiling_row.json"))
+    row = json.loads(row_path.read_text())
+    assert row["status"] == "success"
+    assert row["publish_status"] == "failed"
+    assert "Authorization error" in row["publish_error"]
+
+
+def test_main_returns_nonzero_when_training_subprocess_fails(monkeypatch, fake_args):
+    """A non-zero training return code makes main() return 1 and marks the row as failed."""
+    failing_run = _stub_train_subprocess(mp, returncode=3)
+    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
+    monkeypatch.setattr(mp.subprocess, "check_output", lambda *a, **k: "deadbeef\n")
+    monkeypatch.setattr(mp.subprocess, "run", failing_run)
+
+    assert mp.main() == 1
+
+    row_path = next(fake_args.output_dir.rglob("profiling_row.json"))
+    row = json.loads(row_path.read_text())
+    assert row["status"] == "failed"
+    assert row["return_code"] == 3
+
+
+# ---------------------------------------------------------------------------
+# TrainingProfiler behavior
+# ---------------------------------------------------------------------------
+
+
+def test_deterministic_forward_artifacts_preserve_policy_mode(tmp_path):
+    """The artifact writer runs one forward pass in training mode and leaves that mode set."""
+
+    class _TrainingOnlyPolicy(torch.nn.Module):
+        """Policy whose forward asserts it is in training mode and counts its calls."""
+
+        def __init__(self):
+            super().__init__()
+            self.forward_calls = 0
+
+        def forward(self, batch):
+            self.forward_calls += 1
+            assert self.training
+            values = batch["value"]
+            return values.sum(), {"value": values}
+
+    policy = _TrainingOnlyPolicy()
+    policy.train()
+    single_item_dataset = [{"value": torch.tensor([1.0, 2.0])}]
+
+    mp.write_deterministic_forward_artifacts(
+        policy=policy,
+        dataset=single_item_dataset,
+        batch_size=2,
+        preprocessor=lambda b: b,
+        output_dir=tmp_path,
+        device_type="cpu",
+    )
+
+    artifact_path = tmp_path / "deterministic_forward.json"
+    payload = json.loads(artifact_path.read_text())
+    assert policy.training is True
+    assert policy.forward_calls == 1
+    assert payload["reference_batch_size"] == 2
+    assert "operator_fingerprint" in payload
+    assert payload["outputs"]["loss"]["numel"] == 1
+
+
+def test_deterministic_forward_artifacts_infers_image_keys_without_dataset_meta(tmp_path):
+    """A uint8 image from a plain-list dataset reaches the policy as float32 within [0, 1]."""
+    image_key = "observation.images.front"
+
+    class _ImagePolicy(torch.nn.Module):
+        """Policy whose forward asserts the image dtype and value range."""
+
+        def forward(self, batch):
+            image = batch["observation.images.front"]
+            assert image.dtype == torch.float32
+            assert torch.all((image >= 0.0) & (image <= 1.0))
+            return image.sum(), {"image": image}
+
+    raw_image = torch.tensor([[[0, 255]]], dtype=torch.uint8)
+    mp.write_deterministic_forward_artifacts(
+        policy=_ImagePolicy(),
+        dataset=[{image_key: raw_image}],
+        batch_size=1,
+        preprocessor=lambda b: b,
+        output_dir=tmp_path,
+        device_type="cpu",
+    )
+
+    artifact_path = tmp_path / "deterministic_forward.json"
+    outputs = json.loads(artifact_path.read_text())["outputs"]
+    assert outputs["loss"]["numel"] == 1
+    assert outputs["output_dict"]["image"]["dtype"] == "torch.float32"
+
+
+def test_training_profiler_section_records_forward_backward_optimizer(tmp_path):
+    """Three passes through each section give a count of 3 per section in the summary."""
+    profiler = mp.TrainingProfiler(mode="summary", output_dir=tmp_path, device=torch.device("cpu"))
+    profiler.start()
+    for _ in range(3):
+        for section_name in ("forward", "backward", "optimizer"):
+            with profiler.section(section_name):
+                pass
+    step_metrics = argparse.Namespace(update_s=0.5, dataloading_s=0.01)
+    profiler.step(1, step_metrics)
+    profiler.finalize()
+
+    summary_path = tmp_path / "step_timing_summary.json"
+    payload = json.loads(summary_path.read_text())
+    for summary_key in ("forward_s", "backward_s", "optimizer_s"):
+        assert payload[summary_key]["count"] == 3
+    assert payload["total_update_s"]["mean"] == 0.5
+
+
+def test_training_profiler_accepts_metric_like_values(tmp_path):
+    """Step metrics given as objects exposing ``.val`` end up as plain numbers in the summary."""
+
+    class _MetricLike:
+        """Wrapper exposing its number through a ``val`` attribute."""
+
+        def __init__(self, v):
+            self.val = v
+
+    step_metrics = argparse.Namespace(update_s=_MetricLike(0.6), dataloading_s=_MetricLike(0.05))
+
+    profiler = mp.TrainingProfiler(mode="summary", output_dir=tmp_path, device=torch.device("cpu"))
+    profiler.start()
+    profiler.step(1, step_metrics)
+    profiler.finalize()
+
+    summary_path = tmp_path / "step_timing_summary.json"
+    payload = json.loads(summary_path.read_text())
+    assert payload["total_update_s"]["mean"] == 0.6
+    assert payload["dataloading_s"]["mean"] == 0.05
+
+
+def test_profiler_device_time_uses_generic_attr_first():
+    """An event exposing ``self_device_time_total`` has that value returned unchanged."""
+
+    class _Event:
+        self_device_time_total = 12.3456
+
+    event = _Event()
+    assert mp._get_profiler_device_time_us(event) == 12.3456
@@ -0,0 +1,232 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for the RoboMME env wrapper and config.
+
+RoboMME requires Linux + ManiSkill (Vulkan/SAPIEN), so tests that touch the
+env wrapper mock the ``robomme`` package. Tests that only exercise the
+dataclass config run without any mocking.
+"""
+
+from __future__ import annotations
+
+import sys
+from types import ModuleType
+from unittest.mock import MagicMock
+
+import numpy as np
+
+
+# Names the stub occupies in ``sys.modules``.
+_ROBOMME_STUB_MODULE_NAMES = ("robomme", "robomme.env_record_wrapper")
+# What each name mapped to before the stub was installed (``None`` = absent),
+# so that uninstalling can put a genuinely installed package back.
+_displaced_robomme_modules = {}
+
+
+def _install_robomme_stub():
+    """Register a minimal stub for the ``robomme`` package on sys.modules.
+
+    The stub provides ``robomme.env_record_wrapper.BenchmarkEnvBuilder``, whose
+    ``make_env_for_episode`` returns a ``MagicMock`` env with canned ``reset`` and
+    ``step`` return values. Any module already registered under the stubbed names
+    is remembered so ``_uninstall_robomme_stub`` can restore it.
+    """
+    stub = ModuleType("robomme")
+    wrapper_stub = ModuleType("robomme.env_record_wrapper")
+
+    class FakeBuilder:
+        def __init__(self, **kwargs):
+            pass
+
+        def make_env_for_episode(self, episode_idx: int, max_steps: int):
+            env = MagicMock()
+            obs = {
+                "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "joint_state_list": [np.zeros(7, dtype=np.float32)],
+                "gripper_state_list": [np.zeros(2, dtype=np.float32)],
+            }
+            env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
+            env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
+            return env
+
+    wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
+    stub.env_record_wrapper = wrapper_stub
+    for name, module in zip(_ROBOMME_STUB_MODULE_NAMES, (stub, wrapper_stub)):
+        # setdefault: a repeated install without an uninstall must not replace the
+        # snapshot of the pre-existing module with a previously installed stub.
+        _displaced_robomme_modules.setdefault(name, sys.modules.get(name))
+        sys.modules[name] = module
+
+
+def _uninstall_robomme_stub():
+    """Remove the ``robomme`` stub and restore whatever it displaced.
+
+    Names that were absent before the install (or were never installed through
+    ``_install_robomme_stub``) are simply dropped from ``sys.modules``.
+    """
+    for name in _ROBOMME_STUB_MODULE_NAMES:
+        previous = _displaced_robomme_modules.pop(name, None)
+        if previous is None:
+            sys.modules.pop(name, None)
+        else:
+            sys.modules[name] = previous
+
+
+# ---------------------------------------------------------------------------
+# Config tests (no sim required)
+# ---------------------------------------------------------------------------
+
+
+def test_robomme_env_config_defaults():
+    """A default-constructed RoboMMEEnv config carries the expected field values."""
+    from lerobot.envs.configs import RoboMMEEnv
+
+    expected_defaults = {
+        "task": "PickXtimes",
+        "fps": 10,
+        "episode_length": 300,
+        "action_space": "joint_angle",
+        "dataset_split": "test",
+    }
+    cfg = RoboMMEEnv()
+    for field_name, expected_value in expected_defaults.items():
+        assert getattr(cfg, field_name) == expected_value
+    assert cfg.task_ids is None
+
+
+def test_robomme_env_config_type():
+    """The config reports ``"robomme"`` as its type."""
+    from lerobot.envs.configs import RoboMMEEnv
+
+    config_type = RoboMMEEnv().type
+    assert config_type == "robomme"
+
+
+def test_robomme_features_map():
+    """features_map routes action, both camera streams and agent_pos to the expected keys."""
+    from lerobot.envs.configs import RoboMMEEnv
+    from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
+
+    expected_routes = (
+        (ACTION, ACTION),
+        ("pixels/image", f"{OBS_IMAGES}.image"),
+        ("pixels/wrist_image", f"{OBS_IMAGES}.wrist_image"),
+        ("agent_pos", OBS_STATE),
+    )
+    cfg = RoboMMEEnv()
+    for source_key, target_key in expected_routes:
+        assert cfg.features_map[source_key] == target_key
+
+
+def test_robomme_features_action_dim_joint_angle():
+    """With ``action_space="joint_angle"`` the ACTION feature has shape (8,)."""
+    from lerobot.envs.configs import RoboMMEEnv
+    from lerobot.utils.constants import ACTION
+
+    action_feature = RoboMMEEnv(action_space="joint_angle").features[ACTION]
+    assert action_feature.shape == (8,)
+
+
+def test_robomme_features_action_dim_ee_pose():
+    """`ee_pose` uses a 7-D action; __post_init__ sets the correct shape."""
+    from lerobot.envs.configs import RoboMMEEnv
+    from lerobot.utils.constants import ACTION
+
+    action_feature = RoboMMEEnv(action_space="ee_pose").features[ACTION]
+    assert action_feature.shape == (7,)
+
+
+# ---------------------------------------------------------------------------
+# Obs conversion (pure Python, no sim)
+# ---------------------------------------------------------------------------
+
+
+def test_convert_obs_list_format():
+    """_convert_obs takes the last element from list-format obs fields and
+    emits a nested ``pixels`` dict (image, wrist_image) plus ``agent_pos``.
+
+    The nested layout is required so ``preprocess_observation()`` in
+    ``envs/utils.py`` maps each camera to ``observation.images.<cam>``.
+    """
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import RoboMMEGymEnv
+
+        # Bypass __init__: only the conversion helper is exercised.
+        env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
+
+        latest_front = np.full((256, 256, 3), 42, dtype=np.uint8)
+        latest_wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
+        latest_joints = np.arange(7, dtype=np.float32)
+        latest_gripper = np.array([0.5, 0.5], dtype=np.float32)
+
+        # Each list holds a stale all-zero entry followed by the latest value.
+        raw_obs = {
+            "front_rgb_list": [np.zeros_like(latest_front), latest_front],
+            "wrist_rgb_list": [np.zeros_like(latest_wrist), latest_wrist],
+            "joint_state_list": [np.zeros(7, dtype=np.float32), latest_joints],
+            "gripper_state_list": [np.zeros(2, dtype=np.float32), latest_gripper],
+        }
+
+        converted = env._convert_obs(raw_obs)
+        pixels = converted["pixels"]
+        agent_pos = converted["agent_pos"]
+        np.testing.assert_array_equal(pixels["image"], latest_front)
+        np.testing.assert_array_equal(pixels["wrist_image"], latest_wrist)
+        assert agent_pos.shape == (8,)
+        np.testing.assert_array_almost_equal(agent_pos[:7], latest_joints)
+        assert agent_pos[7] == latest_gripper[0]
+    finally:
+        _uninstall_robomme_stub()
+
+
+def test_convert_obs_array_format():
+    """_convert_obs also handles non-list (direct array) obs."""
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import RoboMMEGymEnv
+
+        # Bypass __init__: only the conversion helper is exercised.
+        env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
+
+        frame = np.zeros((256, 256, 3), dtype=np.uint8)
+        raw_obs = {
+            "front_rgb_list": frame,
+            "wrist_rgb_list": frame,
+            "joint_state_list": np.zeros(7, dtype=np.float32),
+            "gripper_state_list": np.zeros(2, dtype=np.float32),
+        }
+        converted = env._convert_obs(raw_obs)
+        for camera_key in ("image", "wrist_image"):
+            assert converted["pixels"][camera_key].shape == (256, 256, 3)
+        assert converted["agent_pos"].shape == (8,)
+    finally:
+        _uninstall_robomme_stub()
+
+
+# ---------------------------------------------------------------------------
+# create_robomme_envs (mocked sim)
+# ---------------------------------------------------------------------------
+
+
+def test_create_robomme_envs_returns_correct_structure():
+    """Single task -> {task_name: {task_id: VectorEnv}} with one entry per task_id."""
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import create_robomme_envs
+
+        vector_env_factory = MagicMock(return_value=MagicMock())
+        suites = create_robomme_envs(
+            task="PickXtimes",
+            n_envs=1,
+            task_ids=[0, 1],
+            env_cls=vector_env_factory,
+        )
+
+        assert "PickXtimes" in suites
+        for task_id in (0, 1):
+            assert task_id in suites["PickXtimes"]
+        # One vector-env construction per requested task id.
+        assert vector_env_factory.call_count == 2
+    finally:
+        _uninstall_robomme_stub()
+
+
+def test_create_robomme_envs_multi_task():
+    """Comma-separated task list produces one suite per task."""
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import create_robomme_envs
+
+        vector_env_factory = MagicMock(return_value=MagicMock())
+        suites = create_robomme_envs(
+            task="PickXtimes,BinFill,StopCube",
+            n_envs=1,
+            env_cls=vector_env_factory,
+        )
+
+        expected_suites = {"PickXtimes", "BinFill", "StopCube"}
+        assert set(suites.keys()) == expected_suites
+    finally:
+        _uninstall_robomme_stub()
+
+
+def test_create_robomme_envs_raises_on_invalid_env_cls():
+    """Passing ``env_cls=None`` is rejected with ValueError."""
+    _install_robomme_stub()
+    try:
+        import pytest
+
+        from lerobot.envs.robomme import create_robomme_envs
+
+        factory_kwargs = {"task": "PickXtimes", "n_envs": 1, "env_cls": None}
+        with pytest.raises(ValueError, match="env_cls must be a callable"):
+            create_robomme_envs(**factory_kwargs)
+    finally:
+        _uninstall_robomme_stub()