Add chained SLURM mirror-and-double dataset script.

Provide a standalone DataTrove workflow that mirrors bimanual shards, aggregates mirrored output, builds a doubled dataset, and optionally pushes the final dataset to the Hub. Made-with: Cursor
2026-06-18 00:37:10 +00:00 · 2026-02-27 11:13:17 +00:00
53 changed files with 1127 additions and 3370 deletions
@@ -61,7 +61,6 @@ jobs:
      MUJOCO_GL: egl
      HF_HOME: /mnt/cache/.cache/huggingface
      HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
-      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
    steps:
      - uses: actions/checkout@v6
        with:
@@ -90,10 +89,5 @@ jobs:
      - name: Install lerobot with test extras
        run: uv sync --extra "test"

-      - name: Login to Hugging Face
-        run: |
-          uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
-          uv run hf auth whoami
-
      - name: Run pytest
        run: uv run pytest tests -vv --maxfail=10
@@ -60,7 +60,6 @@ jobs:
      MUJOCO_GL: egl
      HF_HOME: /mnt/cache/.cache/huggingface
      HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
-      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
    steps:
      - uses: actions/checkout@v6
        with:
@@ -88,11 +87,6 @@ jobs:
      - name: Install lerobot with all extras
        run: uv sync --extra all # TODO(Steven): Make flash-attn optional

-      - name: Login to Hugging Face
-        run: |
-          uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
-          uv run hf auth whoami
-
      - name: Run pytest (all extras)
        run: uv run pytest tests -vv --maxfail=10

@@ -168,7 +162,6 @@ jobs:
      HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
      TORCH_HOME: /home/user_lerobot/.cache/torch
      TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
-      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
    container:
      image: ${{ needs.build-and-push-docker.outputs.image_tag }} # zizmor: ignore[unpinned-images]
      options: --gpus all --shm-size "16gb"
@@ -180,10 +173,6 @@ jobs:
        shell: bash
        working-directory: /lerobot
    steps:
-      - name: Login to Hugging Face
-        run: |
-          hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
-          hf auth whoami
      - name: Run pytest on GPU
        run: pytest tests -vv --maxfail=10
      - name: Run end-to-end tests
@@ -29,8 +29,6 @@
    title: Using the Dataset Tools
  - local: dataset_subtask
    title: Using Subtasks in the Dataset
-  - local: streaming_video_encoding
-    title: Streaming Video Encoding
  title: "Datasets"
 - sections:
  - local: act
@@ -88,8 +88,5 @@ lerobot-record \
  --dataset.repo_id=${HF_USER}/eval_act_your_dataset \
  --dataset.num_episodes=10 \
  --dataset.single_task="Your task description" \
-  --dataset.streaming_encoding=true \
-  --dataset.encoder_threads=2 \
-  # --dataset.vcodec=auto \
  --policy.path=${HF_USER}/act_policy
 ```
@@ -192,9 +192,6 @@ lerobot-record \
    --dataset.num_episodes=2 \
    --dataset.fps=10 \
    --dataset.single_task="Navigate around obstacles" \
-    --dataset.streaming_encoding=true \
-    --dataset.encoder_threads=2 \
-    # --dataset.vcodec=auto \
    --display_data=true
 ```

@@ -120,12 +120,9 @@ lerobot-record \
  --display_data=true \
  --dataset.repo_id=<user>/eval_groot-bimanual  \
  --dataset.num_episodes=10 \
-  --dataset.single_task="Grab and handover the red cube to the other arm" \
-  --dataset.streaming_encoding=true \
-  --dataset.encoder_threads=2 \
-  # --dataset.vcodec=auto \
-  --policy.path=<user>/groot-bimanual \ # your trained model
-  --dataset.episode_time_s=30 \
+  --dataset.single_task="Grab and handover the red cube to the other arm"
+  --policy.path=<user>/groot-bimanual # your trained model
+  --dataset.episode_time_s=30
  --dataset.reset_time_s=10
 ```

@@ -230,9 +230,6 @@ lerobot-record \
    --dataset.episode_time_s=5 \
    --dataset.push_to_hub=true \
    --dataset.private=true \
-    --dataset.streaming_encoding=true \
-    --dataset.encoder_threads=2 \
-    # --dataset.vcodec=auto \
    --display_data=true
 ```

@@ -276,8 +273,5 @@ lerobot-record \
  --dataset.repo_id=<USER>/eval_hopejr \
  --dataset.single_task="Evaluate hopejr hand policy" \
  --dataset.num_episodes=10 \
-  --dataset.streaming_encoding=true \
-  --dataset.encoder_threads=2 \
-  # --dataset.vcodec=auto \
  --policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model
 ```
@@ -165,7 +165,7 @@ huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
 Then store your Hugging Face repository name in a variable:

 ```bash
-HF_USER=$(hf auth whoami | awk -F': *' 'NR==1 {print $2}')
+HF_USER=$(hf auth whoami | head -n 1)
 echo $HF_USER
 ```

@@ -185,10 +185,7 @@ lerobot-record \
    --display_data=true \
    --dataset.repo_id=${HF_USER}/record-test \
    --dataset.num_episodes=5 \
-    --dataset.single_task="Grab the black cube" \
-    --dataset.streaming_encoding=true \
-    # --dataset.vcodec=auto \
-    --dataset.encoder_threads=2
+    --dataset.single_task="Grab the black cube"
 ```
 </hfoption>
 <hfoption id="API example">
@@ -518,9 +515,6 @@ lerobot-record  \
  --display_data=false \
  --dataset.repo_id=${HF_USER}/eval_so100 \
  --dataset.single_task="Put lego brick into the transparent box" \
-  --dataset.streaming_encoding=true \
-  --dataset.encoder_threads=2 \
-  # --dataset.vcodec=auto \
  # <- Teleop optional if you want to teleoperate in between episodes \
  # --teleop.type=so100_leader \
  # --teleop.port=/dev/ttyACM0 \
@@ -40,13 +40,6 @@ conda install ffmpeg -c conda-forge
 >
 > - _[On Linux only]_ If you want to bring your own ffmpeg: Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure you use the corresponding ffmpeg binary to your install with `which ffmpeg`.

-> [!NOTE]
-> When installing LeRobot inside WSL (Windows Subsystem for Linux), make sure to install `evdev` with the following command:
->
-> ```bash
-> conda install evdev -c conda-forge
-> ```
-
 ## Step 3: Install LeRobot 🤗

 ### From Source
@@ -41,10 +41,7 @@ lerobot-record \
  --display_data=true \
  --dataset.repo_id=${HF_USER}/record-test \
  --dataset.num_episodes=5 \
-  --dataset.single_task="Grab the black cube" \
-  --dataset.streaming_encoding=true \
-  # --dataset.vcodec=auto \
-  --dataset.encoder_threads=2
+  --dataset.single_task="Grab the black cube"
 ```

 See the [recording guide](./il_robots#record-a-dataset) for more details.
@@ -66,13 +66,12 @@ Run on of the examples scripts to teleoperate, record a dataset, replay a datase

 All scripts assume you configured your robot (e.g., SO-100 follower) and set the correct serial port.

-Additionally you need to **copy the URDF of the robot into the examples folder**. For the examples in this tutorial (using SO100/SO101), copy the `SO101` folder from the [SO-ARM100 repo](https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101) into the `examples/phone_to_so100/` directory, so that the URDF file path becomes `examples/phone_to_so100/SO101/so101_new_calib.urdf`.
+Additionally you need to **copy the urdf of the robot to the examples folder**. For the examples in this tutorial (Using SO100/SO101) it is highly recommended to use the urdf in the [SO-ARM100 repo](https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf)

 - Run this example to teleoperate:

  ```bash
-  cd examples/phone_to_so100
-  python teleoperate.py
+  python examples/phone_to_so100/teleoperate.py
  ```

 After running the example:
@@ -85,22 +84,19 @@ Additionally you can customize mapping or safety limits by editing the processor
 - Run this example to record a dataset, which saves absolute end effector observations and actions:

  ```bash
-  cd examples/phone_to_so100
-  python record.py
+  python examples/phone_to_so100/record.py
  ```

 - Run this example to replay recorded episodes:

  ```bash
-  cd examples/phone_to_so100
-  python replay.py
+  python examples/phone_to_so100/replay.py
  ```

 - Run this example to evaluate a pretrained policy:

  ```bash
-  cd examples/phone_to_so100
-  python evaluate.py
+  python examples/phone_to_so100/evaluate.py
  ```

 ### Important pipeline steps and options
@@ -52,7 +52,7 @@ This approach can transform **any existing VLM** into a VLA by training it to pr

 You have two options for the FAST tokenizer:

-1. **Use the pre-trained tokenizer**: The `lerobot/fast-action-tokenizer` tokenizer was trained on 1M+ real robot action sequences and works as a general-purpose tokenizer.
+1. **Use the pre-trained tokenizer**: The `physical-intelligence/fast` tokenizer was trained on 1M+ real robot action sequences and works as a general-purpose tokenizer.

 2. **Train your own tokenizer**: For maximum performance on your specific dataset, you can finetune the tokenizer on your own data.

@@ -114,15 +114,15 @@ lerobot-train \

 ### Key Training Parameters

-| Parameter                              | Description                                        | Default                         |
-| -------------------------------------- | -------------------------------------------------- | ------------------------------- |
-| `--policy.gradient_checkpointing=true` | Reduces memory usage significantly during training | `false`                         |
-| `--policy.dtype=bfloat16`              | Use mixed precision training for efficiency        | `float32`                       |
-| `--policy.chunk_size`                  | Number of action steps to predict (action horizon) | `50`                            |
-| `--policy.n_action_steps`              | Number of action steps to execute                  | `50`                            |
-| `--policy.max_action_tokens`           | Maximum number of FAST tokens per action chunk     | `256`                           |
-| `--policy.action_tokenizer_name`       | FAST tokenizer to use                              | `lerobot/fast-action-tokenizer` |
-| `--policy.compile_model=true`          | Enable torch.compile for faster training           | `false`                         |
+| Parameter                              | Description                                        | Default                      |
+| -------------------------------------- | -------------------------------------------------- | ---------------------------- |
+| `--policy.gradient_checkpointing=true` | Reduces memory usage significantly during training | `false`                      |
+| `--policy.dtype=bfloat16`              | Use mixed precision training for efficiency        | `float32`                    |
+| `--policy.chunk_size`                  | Number of action steps to predict (action horizon) | `50`                         |
+| `--policy.n_action_steps`              | Number of action steps to execute                  | `50`                         |
+| `--policy.max_action_tokens`           | Maximum number of FAST tokens per action chunk     | `256`                        |
+| `--policy.action_tokenizer_name`       | FAST tokenizer to use                              | `physical-intelligence/fast` |
+| `--policy.compile_model=true`          | Enable torch.compile for faster training           | `false`                      |

 ## Inference

@@ -159,9 +159,6 @@ lerobot-record \
    --dataset.fps=15 \
    --dataset.push_to_hub=true \
    --dataset.private=true \
-    --dataset.streaming_encoding=true \
-    --dataset.encoder_threads=2 \
-    # --dataset.vcodec=auto \
    --display_data=true
 ```

@@ -201,9 +198,6 @@ lerobot-record \
    --dataset.fps=15 \
    --dataset.push_to_hub=true \
    --dataset.private=true \
-    --dataset.streaming_encoding=true \
-    --dataset.encoder_threads=2 \
-    # --dataset.vcodec=auto \
    --display_data=true
 ```

@@ -106,9 +106,6 @@ lerobot-record \
  --dataset.repo_id=${HF_USER}/eval_DATASET_NAME_test \  # <- This will be the dataset name on HF Hub
  --dataset.episode_time_s=50 \
  --dataset.num_episodes=10 \
-  --dataset.streaming_encoding=true \
-  --dataset.encoder_threads=2 \
-  # --dataset.vcodec=auto \
  # <- Teleop optional if you want to teleoperate in between episodes \
  # --teleop.type=so100_leader \
  # --teleop.port=/dev/ttyACM0 \
@@ -1,155 +0,0 @@
-# Streaming Video Encoding Guide
-
-## 1. Overview
-
-Streaming video encoding eliminates the traditional PNG round-trip during video dataset recording. Instead of:
-
-1. Capture frame -> write PNG to disk -> (at episode end) read PNG's -> encode to MP4 -> delete PNG's
-
-Frames can be encoded in real-time during capture:
-
-1. Capture frame -> queue to encoder thread -> encode to MP4 directly
-
-This makes `save_episode()` near-instant (the video is already encoded by the time the episode ends) and removes the blocking wait that previously occurred between episodes, especially with multiple cameras in long episodes.
-
-## 2. Tuning Parameters
-
-| Parameter               | CLI Flag                          | Type          | Default       | Description                                                       |
-| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
-| `streaming_encoding`    | `--dataset.streaming_encoding`    | `bool`        | `True`        | Enable real-time encoding during capture                          |
-| `vcodec`                | `--dataset.vcodec`                | `str`         | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder                     |
-| `encoder_threads`       | `--dataset.encoder_threads`       | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide |
-| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int`         | `60`          | Max buffered frames per camera (~2s at 30fps). Consumes RAM       |
-
-## 3. Performance Considerations
-
-Streaming encoding means the CPU is encoding video **during** the capture loop, not after. This creates a CPU budget that must be shared between:
-
- **Control loop** (reading cameras, control the robot, writing non-video data)
- **Encoder threads** (one pool per camera)
- **Rerun visualization** (if enabled)
- **OS and other processes**
-
-### Resolution & Number of Cameras Impact
-
-| Setup                     | Throughput (px/sec) | CPU Encoding Load | Notes                          |
-| ------------------------- | ------------------- | ----------------- | ------------------------------ |
-| 2camsx 640x480x3 @30fps   | 55M                 | Low               | Works on most systems          |
-| 2camsx 1280x720x3 @30fps  | 165M                | Moderate          | Comfortable on modern systems  |
-| 2camsx 1920x1080x3 @30fps | 373M                | High              | Requires powerful high-end CPU |
-
-### `encoder_threads` Tuning
-
-This parameter controls how many threads each encoder instance uses internally:
-
- **Higher values** (e.g., 4-5): Faster encoding, but uses more CPU cores per camera. Good for high-end systems with many cores.
- **Lower values** (e.g., 1-2): Less CPU per camera, freeing cores for capture and visualization. Good for low-res images and capable CPUs.
- **`None` (default)**: Lets the codec decide. Information available in the codec logs.
-
-### Backpressure and Frame Dropping
-
-Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). When the encoder can't keep up:
-
-1. The queue fills up (consuming RAM)
-2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted
-3. A warning is logged: `"Encoder queue full for {camera}, dropped N frame(s)"`
-4. At episode end, total dropped frames per camera are reported
-
-### Symptoms of Encoder Falling Behind
-
- **System feels laggy and freezes**: all CPUs are at 100%
- **Dropped frame warnings** in the log or lower frames/FPS than expected in the recorded dataset
- **Choppy robot movement**: If CPU is severely overloaded, even the capture loop may be affected
- **Accumulated rerun lag**: Visualization falls behind real-time
-
-## 4. Hardware-Accelerated Encoding
-
-### When to Use
-
-Use HW encoding when:
-
- CPU is the bottleneck (dropped frames, choppy robot, rerun lag)
- You have compatible hardware (GPU or dedicated encoder)
- You're recording at high throughput (high resolution or with many cameras)
-
-### Choosing a Codec
-
-| Codec                 | CPU Usage | File Size      | Quality | Notes                                                            |
-| --------------------- | --------- | -------------- | ------- | ---------------------------------------------------------------- |
-| `libsvtav1` (default) | High      | Smallest       | Best    | Default. Best compression but most CPU-intensive                 |
-| `h264`                | Medium    | ~30-50% larger | Good    | Software H.264. Lower CPU                                        |
-| HW encoders           | Very Low  | Largest        | Good    | Offloads to dedicated hardware. Best for CPU-constrained systems |
-
-### Available HW Encoders
-
-| Encoder             | Platform      | Hardware                                                                                         | CLI Value                            |
-| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ |
-| `h264_videotoolbox` | macOS         | Apple Silicon / Intel                                                                            | `--dataset.vcodec=h264_videotoolbox` |
-| `hevc_videotoolbox` | macOS         | Apple Silicon / Intel                                                                            | `--dataset.vcodec=hevc_videotoolbox` |
-| `h264_nvenc`        | Linux/Windows | NVIDIA GPU                                                                                       | `--dataset.vcodec=h264_nvenc`        |
-| `hevc_nvenc`        | Linux/Windows | NVIDIA GPU                                                                                       | `--dataset.vcodec=hevc_nvenc`        |
-| `h264_vaapi`        | Linux         | Intel/AMD GPU                                                                                    | `--dataset.vcodec=h264_vaapi`        |
-| `h264_qsv`          | Linux/Windows | Intel Quick Sync                                                                                 | `--dataset.vcodec=h264_qsv`          |
-| `auto`              | Any           | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto`              |
-
-> [!NOTE]
-> In order to use the HW accelerated encoders you might need to upgrade your GPU drivers.
-
-> [!NOTE]
-> `libsvtav1` is the default because it provides the best training performance; other vcodecs can reduce CPU usage and be faster, but they typically produce larger files and may affect training time.
-
-## 5. Troubleshooting
-
-| Symptom                                                            | Likely Cause                                 | Fix                                                                                                                                                                                                                                                                                  |
-| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage)                | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) |
-| "Encoder queue full" warnings or dropped frames in dataset         | Encoder can't keep up (Queue overflow)       | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`).                                                                                                                                                    |
-| High RAM usage                                                     | Queue filling faster than encoding           | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding                                                                                                                                                                                     |
-| Large video files                                                  | Using HW encoder or H.264                    | Expected trade-off. Switch to `libsvtav1` if CPU allows                                                                                                                                                                                                                              |
-| `save_episode()` still slow                                        | `streaming_encoding` is `False`              | Set `--dataset.streaming_encoding=true`                                                                                                                                                                                                                                              |
-| Encoder thread crash                                               | Codec not available or invalid settings      | Check `vcodec` is installed, try `--dataset.vcodec=auto`                                                                                                                                                                                                                             |
-| Recorded dataset is missing frames                                 | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected.                                   |
-
-## 6. Recommended Configurations
-
-These estimates are conservative; we recommend testing them on your setup—start with a low load and increase it gradually.
-
-### High-End Systems: modern 12+ cores (24+ threads)
-
-A throughput between ~250-500M px/sec should be comfortable in CPU. For even better results try HW encoding if available.
-
-```bash
-# 3camsx 1280x720x3 @30fps: Defaults work well. Optionally increase encoder parallelism.
-# 2camsx 1920x1080x3 @30fps: Defaults work well. Optionally increase encoder parallelism.
-lerobot-record --dataset.encoder_threads=5 ...
-
-# 3camsx 1920x1080x3 @30fps: Might require some tuning.
-```
-
-### Mid-Range Systems: modern 8+ cores (16+ threads) or Apple Silicon
-
-A throughput between ~80-300M px/sec should be possible in CPU.
-
-```bash
-# 3camsx 640x480x3 @30fps: Defaults work well. Optionally decrease encoder parallelism.
-# 2camsx 1280x720x3 @30fps: Defaults work well. Optionally decrease encoder parallelism.
-lerobot-record --dataset.encoder_threads=2 ...
-
-# 2camsx 1920x1080x3 @30fps: Might require some tuning.
-```
-
-### Low-Resource Systems: modern 4+ cores (8+ threads) or Raspberry Pi 5
-
-On very constrained systems, streaming encoding may compete too heavily with the capture loop. Disabling it falls back to the PNG-based approach where encoding happens between episodes (blocking, but doesn't interfere with capture). Alternatively, record at a lower throughput to reduce both capture and encoding load. Consider also changing codec to `h264` and using batch encoding.
-
-```bash
-# 2camsx 640x480x3 @30fps: Requires some tuning.
-
-# Use H.264, disable streaming, consider batching encoding
-lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ...
-```
-
-## 7. Closing note
-
-Performance ultimately depends on your exact setup — frames-per-second, resolution, CPU cores and load, available memory, episode length, and the encoder you choose. Always test with your target workload, be mindful about your CPU & system capabilities and tune `encoder_threads`, `encoder_queue_maxsize`, and
-`vcodec` reasonably. That said, a common practical configuration (for many applications) is three cameras at 640×480x3 @30fps; this usually runs fine with the default streaming video encoding settings in modern systems. Always verify your recorded dataset is healthy by comparing the video duration to the CLI episode duration and confirming the row count equals FPS × CLI duration.
@@ -229,10 +229,7 @@ lerobot-record \
    --dataset.num_episodes=2 \
    --dataset.episode_time_s=5 \
    --dataset.reset_time_s=5 \
-    --dataset.push_to_hub=true \
-    --dataset.streaming_encoding=true \
-    # --dataset.vcodec=auto \
-    --dataset.encoder_threads=2
+    --dataset.push_to_hub=true
 ```

 Example simulation dataset: [nepyope/teleop_test_sim](https://huggingface.co/datasets/nepyope/teleop_test_sim)
@@ -282,10 +279,7 @@ lerobot-record \
    --dataset.num_episodes=2 \
    --dataset.episode_time_s=5 \
    --dataset.reset_time_s=5 \
-    --dataset.push_to_hub=true \
-    --dataset.streaming_encoding=true \
-    # --dataset.vcodec=auto \
-    --dataset.encoder_threads=2
+    --dataset.push_to_hub=true
 ```

 **Note**: Update `server_address` to match your robot's camera server IP.
@@ -0,0 +1,726 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Mirror a bimanual dataset in parallel with DataTrove + SLURM, then double it.
+
+Workflow:
+1) Split source episodes across `num_shards` ranks and mirror each shard in parallel.
+2) Aggregate mirrored shards into one mirrored dataset.
+3) Aggregate [original, mirrored] into a final doubled dataset.
+
+Example:
+python examples/port_datasets/slurm_mirror_dataset.py \
+  --repo-id=pepijn/openarm_bimanual \
+  --output-repo-id=pepijn/openarm_bimanual_doubled \
+  --partition=hopper-cpu \
+  --num-shards=256 \
+  --workers=64 \
+  --cpus-per-task=8 \
+  --mem-per-cpu=4G
+"""
+
+import argparse
+import copy
+import logging
+import shutil
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from datatrove.executor import LocalPipelineExecutor
+from datatrove.executor.slurm import SlurmPipelineExecutor
+from datatrove.pipeline.base import PipelineStep
+
+from lerobot.datasets.aggregate import aggregate_datasets
+from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
+from lerobot.datasets.utils import DEFAULT_FEATURES
+from lerobot.utils.constants import HF_LEROBOT_HOME
+from lerobot.utils.utils import init_logging
+
+logger = logging.getLogger(__name__)
+
+
+OPENARM_MIRRORING_MASK = {
+    "joint_1": -1,
+    "joint_2": -1,
+    "joint_3": -1,
+    "joint_4": 1,
+    "joint_5": -1,
+    "joint_6": -1,
+    "joint_7": -1,
+    "gripper": 1,
+}
+
+
+def get_mirroring_mask(robot_type: str | None) -> dict[str, int]:
+    if robot_type in ["bi_openarm_follower", "openarm_follower", "bi_openarms_follower", "openarms_follower"]:
+        return OPENARM_MIRRORING_MASK
+    raise ValueError(f"Unknown robot type: {robot_type}. Add a mirroring mask for this robot.")
+
+
+def swap_left_right_name(name: str) -> str:
+    value = name.replace("left_", "LEFT_PLACEHOLDER_")
+    value = value.replace("right_", "left_")
+    value = value.replace("LEFT_PLACEHOLDER_", "right_")
+    return value
+
+
+def mirror_feature_names(names: list[str]) -> tuple[list[str], dict[int, int]]:
+    mirrored_names = [swap_left_right_name(n) for n in names]
+    old_to_new_idx = {}
+    for old_idx, old_name in enumerate(names):
+        new_name = swap_left_right_name(old_name)
+        new_idx = mirrored_names.index(new_name)
+        old_to_new_idx[old_idx] = new_idx
+    return mirrored_names, old_to_new_idx
+
+
+def _get_axis_names(feature: dict[str, Any]) -> list[str] | None:
+    names = feature.get("names")
+    if isinstance(names, list):
+        return names
+    if isinstance(names, dict):
+        axes = names.get("axes")
+        if isinstance(axes, list):
+            return axes
+    return None
+
+
+def _to_numpy(value: Any) -> Any:
+    if isinstance(value, np.ndarray):
+        return value
+    if hasattr(value, "detach"):
+        return value.detach().cpu().numpy()
+    if hasattr(value, "cpu") and hasattr(value, "numpy"):
+        return value.cpu().numpy()
+    if hasattr(value, "numpy"):
+        return value.numpy()
+    return value
+
+
+def apply_mirroring_mask(value: float, axis_name: str, mirroring_mask: dict[str, int]) -> float:
+    if axis_name.startswith("left_") or axis_name.startswith("right_"):
+        axis_name = axis_name.split("_", 1)[1]
+    joint_name = axis_name.split(".")[0]
+    return value * mirroring_mask.get(joint_name, 1)
+
+
+def mirror_vector_feature(
+    value: Any,
+    feature: dict[str, Any],
+    mirroring_mask: dict[str, int],
+) -> Any:
+    array = _to_numpy(value)
+    if not isinstance(array, np.ndarray) or array.ndim != 1:
+        return array
+
+    names = _get_axis_names(feature)
+    if names is None or len(names) != len(array):
+        return array
+
+    mirrored_names, index_mapping = mirror_feature_names(names)
+    mirrored = np.zeros_like(array)
+    for old_idx, new_idx in index_mapping.items():
+        mirrored[new_idx] = apply_mirroring_mask(array[old_idx], mirrored_names[new_idx], mirroring_mask)
+    return mirrored
+
+
+def flip_horizontal(value: Any, expected_shape: list[int] | tuple[int, ...]) -> Any:
+    array = _to_numpy(value)
+    if not isinstance(array, np.ndarray) or array.ndim != 3:
+        return array
+
+    expected_shape = tuple(expected_shape)
+    if array.shape == expected_shape:
+        return np.flip(array, axis=1).copy()  # HWC
+
+    if len(expected_shape) == 3:
+        c, h, w = expected_shape
+        if array.shape == (c, h, w):
+            return np.flip(array, axis=2).copy()  # CHW
+
+    # Conservative fallback for unexpected layouts.
+    return np.flip(array, axis=-1).copy()
+
+
+def build_mirrored_features(features: dict[str, dict[str, Any]]) -> dict[str, dict[str, Any]]:
+    mirrored = {}
+    for key, feature in features.items():
+        new_key = swap_left_right_name(key)
+        new_feature = copy.deepcopy(feature)
+        names = new_feature.get("names")
+        if isinstance(names, list):
+            new_feature["names"] = [swap_left_right_name(name) for name in names]
+        elif isinstance(names, dict) and isinstance(names.get("axes"), list):
+            new_feature["names"]["axes"] = [swap_left_right_name(name) for name in names["axes"]]
+        mirrored[new_key] = new_feature
+    return mirrored
+
+
+def build_mirrored_frame(
+    item: dict[str, Any],
+    source_features: dict[str, dict[str, Any]],
+    mirroring_mask: dict[str, int],
+) -> dict[str, Any]:
+    frame = {}
+    for key, feature in source_features.items():
+        if key in DEFAULT_FEATURES:
+            continue
+
+        value = item[key]
+        if key in {"action", "observation.state"}:
+            value = mirror_vector_feature(value, feature, mirroring_mask)
+        elif feature["dtype"] in {"video", "image"}:
+            value = flip_horizontal(value, feature["shape"])
+        else:
+            value = _to_numpy(value)
+
+        frame[swap_left_right_name(key)] = value
+
+    frame["task"] = item["task"]
+    if "timestamp" in item:
+        ts = _to_numpy(item["timestamp"])
+        frame["timestamp"] = float(ts.item() if hasattr(ts, "item") else ts)
+    return frame
+
+
+def _resolve_source_root(repo_id: str, root: Path | None) -> Path:
+    source_meta = LeRobotDatasetMetadata(repo_id=repo_id, root=root)
+    return source_meta.root
+
+
+def _get_work_dir(output_repo_id: str, work_dir: Path | None) -> Path:
+    if work_dir is not None:
+        return work_dir
+    safe_name = output_repo_id.replace("/", "__")
+    return HF_LEROBOT_HOME / "_mirror_work" / safe_name
+
+
+def _get_shard_root(work_dir: Path, world_size: int, rank: int) -> Path:
+    return work_dir / "mirrored_shards" / f"world_{world_size}_rank_{rank}"
+
+
+def _is_valid_dataset_root(root: Path) -> bool:
+    return (root / "meta" / "info.json").exists()
+
+
+def mirror_shard(
+    repo_id: str,
+    source_root: Path,
+    mirrored_repo_id: str,
+    shard_root: Path,
+    rank: int,
+    world_size: int,
+    vcodec: str,
+    overwrite: bool,
+) -> None:
+    source_dataset = LeRobotDataset(repo_id=repo_id, root=source_root)
+    selected_episodes = list(range(rank, source_dataset.meta.total_episodes, world_size))
+
+    if len(selected_episodes) == 0:
+        logger.info("Rank %s has no episodes assigned. Skipping.", rank)
+        return
+
+    if shard_root.exists():
+        if overwrite:
+            shutil.rmtree(shard_root)
+        elif _is_valid_dataset_root(shard_root):
+            logger.info("Rank %s shard already exists at %s. Skipping.", rank, shard_root)
+            return
+        else:
+            raise RuntimeError(
+                f"Shard root {shard_root} exists but is not a valid dataset. Use --overwrite to recreate."
+            )
+
+    mirroring_mask = get_mirroring_mask(source_dataset.meta.robot_type)
+    mirrored_features = build_mirrored_features(source_dataset.meta.features)
+
+    shard_repo_name = f"{mirrored_repo_id}_world_{world_size}_rank_{rank}"
+    mirrored_dataset = LeRobotDataset.create(
+        repo_id=shard_repo_name,
+        root=shard_root,
+        fps=source_dataset.meta.fps,
+        features=mirrored_features,
+        robot_type=source_dataset.meta.robot_type,
+        use_videos=len(source_dataset.meta.video_keys) > 0,
+        vcodec=vcodec,
+    )
+    mirrored_dataset.meta.update_chunk_settings(
+        chunks_size=source_dataset.meta.chunks_size,
+        data_files_size_in_mb=source_dataset.meta.data_files_size_in_mb,
+        video_files_size_in_mb=source_dataset.meta.video_files_size_in_mb,
+    )
+
+    logger.info(
+        "Rank %s processing %s episodes into shard %s",
+        rank,
+        len(selected_episodes),
+        shard_root,
+    )
+    for source_ep_idx in selected_episodes:
+        episode = source_dataset.meta.episodes[source_ep_idx]
+        start_idx = int(episode["dataset_from_index"])
+        end_idx = int(episode["dataset_to_index"])
+
+        for frame_idx in range(start_idx, end_idx):
+            item = source_dataset[frame_idx]
+            mirrored_frame = build_mirrored_frame(
+                item=item,
+                source_features=source_dataset.meta.features,
+                mirroring_mask=mirroring_mask,
+            )
+            mirrored_dataset.add_frame(mirrored_frame)
+
+        mirrored_dataset.save_episode()
+
+    mirrored_dataset.finalize()
+
+
+class MirrorDatasetShards(PipelineStep):
+    def __init__(
+        self,
+        repo_id: str,
+        source_root: Path,
+        mirrored_repo_id: str,
+        work_dir: Path,
+        vcodec: str,
+        overwrite: bool,
+    ):
+        super().__init__()
+        self.repo_id = repo_id
+        self.source_root = source_root
+        self.mirrored_repo_id = mirrored_repo_id
+        self.work_dir = work_dir
+        self.vcodec = vcodec
+        self.overwrite = overwrite
+
+    def run(self, data=None, rank: int = 0, world_size: int = 1):
+        init_logging()
+        shard_root = _get_shard_root(self.work_dir, world_size, rank)
+        mirror_shard(
+            repo_id=self.repo_id,
+            source_root=self.source_root,
+            mirrored_repo_id=self.mirrored_repo_id,
+            shard_root=shard_root,
+            rank=rank,
+            world_size=world_size,
+            vcodec=self.vcodec,
+            overwrite=self.overwrite,
+        )
+
+
+def make_mirror_executor(
+    repo_id: str,
+    source_root: Path,
+    mirrored_repo_id: str,
+    work_dir: Path,
+    logs_dir: Path,
+    job_name: str,
+    num_shards: int,
+    workers: int,
+    partition: str,
+    cpus_per_task: int,
+    mem_per_cpu: str,
+    time_limit: str,
+    vcodec: str,
+    overwrite: bool,
+    slurm: bool,
+):
+    kwargs = {
+        "pipeline": [
+            MirrorDatasetShards(
+                repo_id=repo_id,
+                source_root=source_root,
+                mirrored_repo_id=mirrored_repo_id,
+                work_dir=work_dir,
+                vcodec=vcodec,
+                overwrite=overwrite,
+            ),
+        ],
+        "logging_dir": str(logs_dir / job_name),
+    }
+
+    if slurm:
+        if partition is None:
+            raise ValueError("`--partition` is required when `--slurm 1`.")
+        kwargs.update(
+            {
+                "job_name": job_name,
+                "tasks": num_shards,
+                "workers": workers,
+                "time": time_limit,
+                "partition": partition,
+                "cpus_per_task": cpus_per_task,
+                "sbatch_args": {"mem-per-cpu": mem_per_cpu},
+            }
+        )
+        return SlurmPipelineExecutor(**kwargs)
+
+    kwargs.update({"tasks": num_shards, "workers": 1})
+    return LocalPipelineExecutor(**kwargs)
+
+
+class AggregateMirroredShardsStep(PipelineStep):
+    def __init__(
+        self,
+        mirrored_repo_id: str,
+        mirrored_root: Path,
+        work_dir: Path,
+        num_shards: int,
+        overwrite: bool,
+    ):
+        super().__init__()
+        self.mirrored_repo_id = mirrored_repo_id
+        self.mirrored_root = mirrored_root
+        self.work_dir = work_dir
+        self.num_shards = num_shards
+        self.overwrite = overwrite
+
+    def run(self, data=None, rank: int = 0, world_size: int = 1):
+        init_logging()
+        if rank != 0:
+            logger.info("Skipping rank %s for aggregate mirrored step", rank)
+            return
+        aggregate_mirrored_shards(
+            mirrored_repo_id=self.mirrored_repo_id,
+            mirrored_root=self.mirrored_root,
+            work_dir=self.work_dir,
+            num_shards=self.num_shards,
+            overwrite=self.overwrite,
+        )
+
+
+class BuildDoubledDatasetStep(PipelineStep):
+    def __init__(
+        self,
+        source_repo_id: str,
+        source_root: Path,
+        mirrored_repo_id: str,
+        mirrored_root: Path,
+        output_repo_id: str,
+        output_root: Path,
+        overwrite: bool,
+    ):
+        super().__init__()
+        self.source_repo_id = source_repo_id
+        self.source_root = source_root
+        self.mirrored_repo_id = mirrored_repo_id
+        self.mirrored_root = mirrored_root
+        self.output_repo_id = output_repo_id
+        self.output_root = output_root
+        self.overwrite = overwrite
+
+    def run(self, data=None, rank: int = 0, world_size: int = 1):
+        init_logging()
+        if rank != 0:
+            logger.info("Skipping rank %s for build doubled step", rank)
+            return
+        build_doubled_dataset(
+            source_repo_id=self.source_repo_id,
+            source_root=self.source_root,
+            mirrored_repo_id=self.mirrored_repo_id,
+            mirrored_root=self.mirrored_root,
+            output_repo_id=self.output_repo_id,
+            output_root=self.output_root,
+            overwrite=self.overwrite,
+        )
+
+
+class PushDoubledDatasetStep(PipelineStep):
+    def __init__(
+        self,
+        output_repo_id: str,
+        output_root: Path,
+    ):
+        super().__init__()
+        self.output_repo_id = output_repo_id
+        self.output_root = output_root
+
+    def run(self, data=None, rank: int = 0, world_size: int = 1):
+        init_logging()
+        if rank != 0:
+            logger.info("Skipping rank %s for push step", rank)
+            return
+        logger.info("Pushing doubled dataset to hub: %s", self.output_repo_id)
+        LeRobotDataset(self.output_repo_id, root=self.output_root).push_to_hub()
+
+
+def make_single_task_executor(
+    step: PipelineStep,
+    logs_dir: Path,
+    job_name: str,
+    partition: str | None,
+    cpus_per_task: int,
+    mem_per_cpu: str,
+    time_limit: str,
+    slurm: bool,
+    depends: SlurmPipelineExecutor | None = None,
+):
+    kwargs = {"pipeline": [step], "logging_dir": str(logs_dir / job_name)}
+    if slurm:
+        if partition is None:
+            raise ValueError("`--partition` is required when `--slurm 1`.")
+        kwargs.update(
+            {
+                "job_name": job_name,
+                "tasks": 1,
+                "workers": 1,
+                "time": time_limit,
+                "partition": partition,
+                "cpus_per_task": cpus_per_task,
+                "sbatch_args": {"mem-per-cpu": mem_per_cpu},
+                "depends": depends,
+            }
+        )
+        return SlurmPipelineExecutor(**kwargs)
+
+    kwargs.update({"tasks": 1, "workers": 1})
+    return LocalPipelineExecutor(**kwargs)
+
+
+def aggregate_mirrored_shards(
+    mirrored_repo_id: str,
+    mirrored_root: Path,
+    work_dir: Path,
+    num_shards: int,
+    overwrite: bool,
+):
+    if mirrored_root.exists():
+        if overwrite:
+            shutil.rmtree(mirrored_root)
+        elif _is_valid_dataset_root(mirrored_root):
+            logger.info("Mirrored dataset already exists at %s. Skipping aggregation.", mirrored_root)
+            return
+        else:
+            raise RuntimeError(
+                f"Mirrored root {mirrored_root} exists but is not a valid dataset. Use --overwrite to recreate."
+            )
+
+    shard_repo_ids = []
+    shard_roots = []
+    for rank in range(num_shards):
+        shard_root = _get_shard_root(work_dir, num_shards, rank)
+        if _is_valid_dataset_root(shard_root):
+            shard_repo_ids.append(f"{mirrored_repo_id}_world_{num_shards}_rank_{rank}")
+            shard_roots.append(shard_root)
+
+    if len(shard_repo_ids) == 0:
+        raise RuntimeError("No mirrored shards were produced. Nothing to aggregate.")
+
+    logger.info("Aggregating %s mirrored shards into %s", len(shard_repo_ids), mirrored_root)
+    aggregate_datasets(
+        repo_ids=shard_repo_ids,
+        roots=shard_roots,
+        aggr_repo_id=mirrored_repo_id,
+        aggr_root=mirrored_root,
+    )
+
+
+def build_doubled_dataset(
+    source_repo_id: str,
+    source_root: Path,
+    mirrored_repo_id: str,
+    mirrored_root: Path,
+    output_repo_id: str,
+    output_root: Path,
+    overwrite: bool,
+):
+    if output_root.exists():
+        if overwrite:
+            shutil.rmtree(output_root)
+        elif _is_valid_dataset_root(output_root):
+            logger.info("Doubled dataset already exists at %s. Skipping final aggregation.", output_root)
+            return
+        else:
+            raise RuntimeError(
+                f"Output root {output_root} exists but is not a valid dataset. Use --overwrite to recreate."
+            )
+
+    logger.info("Aggregating source + mirrored into doubled dataset at %s", output_root)
+    aggregate_datasets(
+        repo_ids=[source_repo_id, mirrored_repo_id],
+        roots=[source_root, mirrored_root],
+        aggr_repo_id=output_repo_id,
+        aggr_root=output_root,
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--repo-id", type=str, required=True, help="Source dataset repo id.")
+    parser.add_argument("--output-repo-id", type=str, required=True, help="Final doubled dataset repo id.")
+    parser.add_argument("--root", type=Path, default=None, help="Root path of source dataset.")
+    parser.add_argument(
+        "--output-root",
+        type=Path,
+        default=None,
+        help="Root path where final doubled dataset is written.",
+    )
+    parser.add_argument(
+        "--work-dir",
+        type=Path,
+        default=None,
+        help="Intermediate directory for mirrored shards and mirrored aggregate dataset.",
+    )
+    parser.add_argument("--logs-dir", type=Path, required=True, help="DataTrove logs path.")
+    parser.add_argument("--job-name", type=str, default="mirror_dataset", help="SLURM job name.")
+    parser.add_argument("--num-shards", type=int, default=256, help="Number of DataTrove tasks/ranks.")
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=64,
+        help="Max concurrent DataTrove workers on SLURM.",
+    )
+    parser.add_argument("--partition", type=str, default=None, help="SLURM partition (e.g. hopper-cpu).")
+    parser.add_argument("--cpus-per-task", type=int, default=8, help="CPU count per SLURM task.")
+    parser.add_argument("--mem-per-cpu", type=str, default="4G", help="Memory per CPU for SLURM task.")
+    parser.add_argument("--time", type=str, default="24:00:00", help="SLURM time limit.")
+    parser.add_argument("--vcodec", type=str, default="libsvtav1", help="Video codec for output videos.")
+    parser.add_argument(
+        "--slurm",
+        type=int,
+        default=1,
+        help="Use SLURM executor. Set 0 for local sequential debugging.",
+    )
+    parser.add_argument("--overwrite", action="store_true", help="Delete existing intermediate/final outputs.")
+    parser.add_argument(
+        "--push-to-hub",
+        action="store_true",
+        help="Push final doubled dataset to Hugging Face Hub after completion.",
+    )
+    args = parser.parse_args()
+
+    init_logging()
+    slurm = args.slurm == 1
+
+    source_root = _resolve_source_root(args.repo_id, args.root)
+    output_root = args.output_root if args.output_root is not None else HF_LEROBOT_HOME / args.output_repo_id
+    work_dir = _get_work_dir(args.output_repo_id, args.work_dir)
+    mirrored_repo_id = f"{args.output_repo_id}_mirrored"
+    mirrored_root = work_dir / "mirrored_aggregate"
+
+    work_dir.mkdir(parents=True, exist_ok=True)
+    args.logs_dir.mkdir(parents=True, exist_ok=True)
+
+    mirror_executor = make_mirror_executor(
+        repo_id=args.repo_id,
+        source_root=source_root,
+        mirrored_repo_id=mirrored_repo_id,
+        work_dir=work_dir,
+        logs_dir=args.logs_dir,
+        job_name=args.job_name,
+        num_shards=args.num_shards,
+        workers=args.workers,
+        partition=args.partition,
+        cpus_per_task=args.cpus_per_task,
+        mem_per_cpu=args.mem_per_cpu,
+        time_limit=args.time,
+        vcodec=args.vcodec,
+        overwrite=args.overwrite,
+        slurm=slurm,
+    )
+    if slurm:
+        aggregate_executor = make_single_task_executor(
+            step=AggregateMirroredShardsStep(
+                mirrored_repo_id=mirrored_repo_id,
+                mirrored_root=mirrored_root,
+                work_dir=work_dir,
+                num_shards=args.num_shards,
+                overwrite=args.overwrite,
+            ),
+            logs_dir=args.logs_dir,
+            job_name=f"{args.job_name}_aggregate_mirrored",
+            partition=args.partition,
+            cpus_per_task=args.cpus_per_task,
+            mem_per_cpu=args.mem_per_cpu,
+            time_limit=args.time,
+            slurm=True,
+            depends=mirror_executor,
+        )
+        build_executor = make_single_task_executor(
+            step=BuildDoubledDatasetStep(
+                source_repo_id=args.repo_id,
+                source_root=source_root,
+                mirrored_repo_id=mirrored_repo_id,
+                mirrored_root=mirrored_root,
+                output_repo_id=args.output_repo_id,
+                output_root=output_root,
+                overwrite=args.overwrite,
+            ),
+            logs_dir=args.logs_dir,
+            job_name=f"{args.job_name}_build_doubled",
+            partition=args.partition,
+            cpus_per_task=args.cpus_per_task,
+            mem_per_cpu=args.mem_per_cpu,
+            time_limit=args.time,
+            slurm=True,
+            depends=aggregate_executor,
+        )
+
+        final_executor: SlurmPipelineExecutor | LocalPipelineExecutor = build_executor
+        push_executor = None
+        if args.push_to_hub:
+            push_executor = make_single_task_executor(
+                step=PushDoubledDatasetStep(
+                    output_repo_id=args.output_repo_id,
+                    output_root=output_root,
+                ),
+                logs_dir=args.logs_dir,
+                job_name=f"{args.job_name}_push",
+                partition=args.partition,
+                cpus_per_task=args.cpus_per_task,
+                mem_per_cpu=args.mem_per_cpu,
+                time_limit=args.time,
+                slurm=True,
+                depends=build_executor,
+            )
+            final_executor = push_executor
+
+        final_executor.run()
+        logger.info(
+            "Submitted SLURM chain. job_ids: mirror=%s aggregate=%s doubled=%s push=%s",
+            mirror_executor.job_id,
+            aggregate_executor.job_id,
+            build_executor.job_id,
+            push_executor.job_id if push_executor is not None else None,
+        )
+        return
+
+    mirror_executor.run()
+    aggregate_mirrored_shards(
+        mirrored_repo_id=mirrored_repo_id,
+        mirrored_root=mirrored_root,
+        work_dir=work_dir,
+        num_shards=args.num_shards,
+        overwrite=args.overwrite,
+    )
+    build_doubled_dataset(
+        source_repo_id=args.repo_id,
+        source_root=source_root,
+        mirrored_repo_id=mirrored_repo_id,
+        mirrored_root=mirrored_root,
+        output_repo_id=args.output_repo_id,
+        output_root=output_root,
+        overwrite=args.overwrite,
+    )
+    if args.push_to_hub:
+        logger.info("Pushing doubled dataset to hub: %s", args.output_repo_id)
+        LeRobotDataset(args.output_repo_id, root=output_root).push_to_hub()
+
+
+if __name__ == "__main__":
+    main()
@@ -61,7 +61,7 @@ dependencies = [
    # Hugging Face dependencies
    "datasets>=4.0.0,<5.0.0",
    "diffusers>=0.27.2,<0.36.0",
-    "huggingface-hub[cli]>=1.0.0,<2.0.0",
+    "huggingface-hub[hf-transfer,cli]>=0.34.2,<0.36.0",
    "accelerate>=1.10.0,<2.0.0",

    # Core dependencies
@@ -96,15 +96,13 @@ dependencies = [
 # Common
 pygame-dep = ["pygame>=2.5.1,<2.7.0"]
 placo-dep = ["placo>=0.9.6,<0.10.0"]
-transformers-dep = ["transformers>=5.1.0,<6.0.0"]
+transformers-dep = ["transformers>=4.57.1,<5.0.0"]
 grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
-can-dep = ["python-can>=4.2.0,<5.0.0"]

 # Motors
 feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"]
 dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0"]
-damiao = ["lerobot[can-dep]"]
-robstride = ["lerobot[can-dep]"]
+damiao = ["python-can>=4.2.0,<5.0.0"]

 # Robots
 openarms = ["lerobot[damiao]"]
@@ -129,13 +127,13 @@ phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0"]

 # Policies
 wallx = [
-    "lerobot[transformers-dep]",
-    "peft>=0.18.0,<1.0.0",
-    "scipy==1.15.3", # TODO: Relax version
-    "torchdiffeq==0.2.5", # TODO: Relax version
-    "qwen-vl-utils==0.0.11" # TODO: Relax version
+    "transformers==4.49.0",
+    "peft==0.17.1",
+    "scipy==1.15.3",
+    "torchdiffeq==0.2.5",
+    "qwen_vl_utils==0.0.11"
 ]
-pi = ["lerobot[transformers-dep]", "scipy==1.15.3"] # TODO: Relax scipy version
+pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi", "scipy>=1.10.1,<1.15"]
 smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"]
 groot = [
    "lerobot[transformers-dep]",
@@ -148,7 +146,7 @@ groot = [
    "ninja>=1.11.1,<2.0.0",
    "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
 ]
-sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "qwen-vl-utils>=0.0.11,<0.1.0"]
+sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "qwen-vl-utils>=0.0.14,<0.1.0"]
 xvla = ["lerobot[transformers-dep]"]
 hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]

@@ -176,8 +174,8 @@ all = [
    "lerobot[reachy2]",
    "lerobot[kinematics]",
    "lerobot[intelrealsense]",
-    "lerobot[wallx]",
-    "lerobot[pi]",
+    # "lerobot[wallx]",
+    # "lerobot[pi]", TODO(Pepijn): Update pi to transformers v5
    "lerobot[smolvla]",
    # "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
    "lerobot[xvla]",
@@ -394,3 +392,85 @@ ignore_errors = false
 # [[tool.mypy.overrides]]
 # module = "lerobot.scripts.*"
 # ignore_errors = false
+
+[tool.uv]
+# wallx requires transformers==4.49.0 which conflicts with other extras that need >=4.53.0
+conflicts = [
+    [
+        { extra = "wallx" },
+        { extra = "transformers-dep" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "pi" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "smolvla" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "groot" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "xvla" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "sarm" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "hilserl" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "libero" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "peft" },
+    ],
+    [
+        { extra = "wallx" },
+        { extra = "all" },
+    ],
+    # pi uses custom branch which conflicts with transformers-dep
+    [
+        { extra = "pi" },
+        { extra = "transformers-dep" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "smolvla" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "groot" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "xvla" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "sarm" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "hilserl" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "libero" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "peft" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "all" },
+    ],
+]
@@ -68,7 +68,6 @@ from lerobot.datasets.utils import (
    write_tasks,
 )
 from lerobot.datasets.video_utils import (
-    StreamingVideoEncoder,
    VideoFrame,
    concatenate_video_files,
    decode_video_frames,
@@ -76,11 +75,11 @@ from lerobot.datasets.video_utils import (
    get_safe_default_codec,
    get_video_duration_in_s,
    get_video_info,
-    resolve_vcodec,
 )
 from lerobot.utils.constants import HF_LEROBOT_HOME

 CODEBASE_VERSION = "v3.0"
+VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1"}


 class LeRobotDatasetMetadata:
@@ -546,19 +545,12 @@ class LeRobotDatasetMetadata:


 def _encode_video_worker(
-    video_key: str,
-    episode_index: int,
-    root: Path,
-    fps: int,
-    vcodec: str = "libsvtav1",
-    encoder_threads: int | None = None,
+    video_key: str, episode_index: int, root: Path, fps: int, vcodec: str = "libsvtav1"
 ) -> Path:
    temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
    img_dir = (root / fpath).parent
-    encode_video_frames(
-        img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
-    )
+    encode_video_frames(img_dir, temp_path, fps, vcodec=vcodec, overwrite=True)
    shutil.rmtree(img_dir)
    return temp_path

@@ -578,9 +570,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        video_backend: str | None = None,
        batch_encoding_size: int = 1,
        vcodec: str = "libsvtav1",
-        streaming_encoding: bool = False,
-        encoder_queue_maxsize: int = 30,
-        encoder_threads: int | None = None,
    ):
        """
        2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -694,17 +683,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
            batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
                Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
            vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
-                'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
-                Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
-            streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
-                instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
-            encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
-                streaming encoding. Defaults to 30 (~1s at 30fps).
-            encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
-                codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
-                libsvtav1 and 'threads' for h264/hevc.
+                'libsvtav1'. Defaults to 'libsvtav1'. Use 'h264' for faster encoding on systems where AV1
+                encoding is CPU-heavy.
        """
        super().__init__()
+        if vcodec not in VALID_VIDEO_CODECS:
+            raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
        self.repo_id = repo_id
        self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
        self.image_transforms = image_transforms
@@ -716,8 +700,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.delta_indices = None
        self.batch_encoding_size = batch_encoding_size
        self.episodes_since_last_encoding = 0
-        self.vcodec = resolve_vcodec(vcodec)
-        self._encoder_threads = encoder_threads
+        self.vcodec = vcodec

        # Unused attributes
        self.image_writer = None
@@ -725,7 +708,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.writer = None
        self.latest_episode = None
        self._current_file_start_frame = None  # Track the starting frame index of the current parquet file
-        self._streaming_encoder = None

        self.root.mkdir(exist_ok=True, parents=True)

@@ -767,19 +749,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
            check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s)
            self.delta_indices = get_delta_indices(self.delta_timestamps, self.fps)

-        # Initialize streaming encoder for resumed recording
-        if streaming_encoding and len(self.meta.video_keys) > 0:
-            self._streaming_encoder = StreamingVideoEncoder(
-                fps=self.meta.fps,
-                vcodec=self.vcodec,
-                pix_fmt="yuv420p",
-                g=2,
-                crf=30,
-                preset=None,
-                queue_maxsize=encoder_queue_maxsize,
-                encoder_threads=encoder_threads,
-            )
-
    def _close_writer(self) -> None:
        """Close and cleanup the parquet writer if it exists."""
        writer = getattr(self, "writer", None)
@@ -1135,8 +1104,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        """
        self._close_writer()
        self.meta._close_writer()
-        if self._streaming_encoder is not None:
-            self._streaming_encoder.close()

    def create_episode_buffer(self, episode_index: int | None = None) -> dict:
        current_ep_idx = self.meta.total_episodes if episode_index is None else episode_index
@@ -1191,13 +1158,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.episode_buffer["timestamp"].append(timestamp)
        self.episode_buffer["task"].append(frame.pop("task"))  # Remove task from frame after processing

-        # Start streaming encoder on first frame of episode (once, before iterating keys)
-        if frame_index == 0 and self._streaming_encoder is not None:
-            self._streaming_encoder.start_episode(
-                video_keys=list(self.meta.video_keys),
-                temp_dir=self.root,
-            )
-
        # Add frame features to episode_buffer
        for key in frame:
            if key not in self.features:
@@ -1205,10 +1165,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                    f"An element of the frame is not in the features. '{key}' not in '{self.features.keys()}'."
                )

-            if self.features[key]["dtype"] == "video" and self._streaming_encoder is not None:
-                self._streaming_encoder.feed_frame(key, frame[key])
-                self.episode_buffer[key].append(None)  # Placeholder (video keys are skipped in parquet)
-            elif self.features[key]["dtype"] in ["image", "video"]:
+            if self.features[key]["dtype"] in ["image", "video"]:
                img_path = self._get_image_file_path(
                    episode_index=self.episode_buffer["episode_index"], image_key=key, frame_index=frame_index
                )
@@ -1269,38 +1226,13 @@ class LeRobotDataset(torch.utils.data.Dataset):

        # Wait for image writer to end, so that episode stats over images can be computed
        self._wait_image_writer()
-
-        has_video_keys = len(self.meta.video_keys) > 0
-        use_streaming = self._streaming_encoder is not None and has_video_keys
-        use_batched_encoding = self.batch_encoding_size > 1
-
-        if use_streaming:
-            # Compute stats for non-video features only (video stats come from encoder)
-            non_video_buffer = {
-                k: v
-                for k, v in episode_buffer.items()
-                if self.features.get(k, {}).get("dtype") not in ("video",)
-            }
-            non_video_features = {k: v for k, v in self.features.items() if v["dtype"] != "video"}
-            ep_stats = compute_episode_stats(non_video_buffer, non_video_features)
-        else:
-            ep_stats = compute_episode_stats(episode_buffer, self.features)
+        ep_stats = compute_episode_stats(episode_buffer, self.features)

        ep_metadata = self._save_episode_data(episode_buffer)
+        has_video_keys = len(self.meta.video_keys) > 0
+        use_batched_encoding = self.batch_encoding_size > 1

-        if use_streaming:
-            # Finish streaming encoding and collect results
-            streaming_results = self._streaming_encoder.finish_episode()
-            for video_key in self.meta.video_keys:
-                temp_path, video_stats = streaming_results[video_key]
-                if video_stats is not None:
-                    # Format stats same as compute_episode_stats: normalize to [0,1], reshape to (C,1,1)
-                    ep_stats[video_key] = {
-                        k: v if k == "count" else np.squeeze(v.reshape(1, -1, 1, 1) / 255.0, axis=0)
-                        for k, v in video_stats.items()
-                    }
-                ep_metadata.update(self._save_episode_video(video_key, episode_index, temp_path=temp_path))
-        elif has_video_keys and not use_batched_encoding:
+        if has_video_keys and not use_batched_encoding:
            num_cameras = len(self.meta.video_keys)
            if parallel_encoding and num_cameras > 1:
                # TODO(Steven): Ideally we would like to control the number of threads per encoding such that:
@@ -1314,7 +1246,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
                            self.root,
                            self.fps,
                            self.vcodec,
-                            self._encoder_threads,
                        ): video_key
                        for video_key in self.meta.video_keys
                    }
@@ -1583,10 +1514,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        return metadata

    def clear_episode_buffer(self, delete_images: bool = True) -> None:
-        # Cancel streaming encoder if active
-        if self._streaming_encoder is not None:
-            self._streaming_encoder.cancel_episode()
-
        # Clean up image files for the current episode buffer
        if delete_images:
            # Wait for the async image writer to finish
@@ -1634,9 +1561,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        Note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding,
        since video encoding with ffmpeg is already using multithreading.
        """
-        return _encode_video_worker(
-            video_key, episode_index, self.root, self.fps, self.vcodec, self._encoder_threads
-        )
+        return _encode_video_worker(video_key, episode_index, self.root, self.fps, self.vcodec)

    @classmethod
    def create(
@@ -1653,13 +1578,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
        video_backend: str | None = None,
        batch_encoding_size: int = 1,
        vcodec: str = "libsvtav1",
-        metadata_buffer_size: int = 10,
-        streaming_encoding: bool = False,
-        encoder_queue_maxsize: int = 30,
-        encoder_threads: int | None = None,
    ) -> "LeRobotDataset":
        """Create a LeRobot Dataset from scratch in order to record data."""
-        vcodec = resolve_vcodec(vcodec)
+        if vcodec not in VALID_VIDEO_CODECS:
+            raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
        obj = cls.__new__(cls)
        obj.meta = LeRobotDatasetMetadata.create(
            repo_id=repo_id,
@@ -1668,7 +1590,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
            features=features,
            root=root,
            use_videos=use_videos,
-            metadata_buffer_size=metadata_buffer_size,
        )
        obj.repo_id = obj.meta.repo_id
        obj.root = obj.meta.root
@@ -1678,7 +1599,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj.batch_encoding_size = batch_encoding_size
        obj.episodes_since_last_encoding = 0
        obj.vcodec = vcodec
-        obj._encoder_threads = encoder_threads

        if image_writer_processes or image_writer_threads:
            obj.start_image_writer(image_writer_processes, image_writer_threads)
@@ -1700,22 +1620,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        obj._lazy_loading = False
        obj._recorded_frames = 0
        obj._writer_closed_for_reading = False
-
-        # Initialize streaming encoder
-        if streaming_encoding and len(obj.meta.video_keys) > 0:
-            obj._streaming_encoder = StreamingVideoEncoder(
-                fps=fps,
-                vcodec=vcodec,
-                pix_fmt="yuv420p",
-                g=2,
-                crf=30,
-                preset=None,
-                queue_maxsize=encoder_queue_maxsize,
-                encoder_threads=encoder_threads,
-            )
-        else:
-            obj._streaming_encoder = None
-
        return obj


@@ -13,106 +13,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
 import glob
 import importlib
 import logging
-import queue
 import shutil
 import tempfile
-import threading
 import warnings
 from dataclasses import dataclass, field
-from fractions import Fraction
 from pathlib import Path
 from threading import Lock
 from typing import Any, ClassVar

 import av
 import fsspec
-import numpy as np
 import pyarrow as pa
 import torch
 import torchvision
 from datasets.features.features import register_feature
 from PIL import Image

-# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build.
-# Determines the order of preference for auto-selection when vcodec="auto" is used.
-HW_ENCODERS = [
-    "h264_videotoolbox",  # macOS
-    "hevc_videotoolbox",  # macOS
-    "h264_nvenc",  # NVIDIA GPU
-    "hevc_nvenc",  # NVIDIA GPU
-    "h264_vaapi",  # Linux Intel/AMD
-    "h264_qsv",  # Intel Quick Sync
-]
-
-VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS)
-
-
-def _get_codec_options(
-    vcodec: str,
-    g: int | None = 2,
-    crf: int | None = 30,
-    preset: int | None = None,
-) -> dict:
-    """Build codec-specific options dict for video encoding."""
-    options = {}
-
-    # GOP size (keyframe interval) - supported by VideoToolbox and software encoders
-    if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS):
-        options["g"] = str(g)
-
-    # Quality control (codec-specific parameter names)
-    if crf is not None:
-        if vcodec in ("h264", "hevc", "libsvtav1"):
-            options["crf"] = str(crf)
-        elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
-            quality = max(1, min(100, int(100 - crf * 2)))
-            options["q:v"] = str(quality)
-        elif vcodec in ("h264_nvenc", "hevc_nvenc"):
-            options["rc"] = "constqp"
-            options["qp"] = str(crf)
-        elif vcodec in ("h264_vaapi",):
-            options["qp"] = str(crf)
-        elif vcodec in ("h264_qsv",):
-            options["global_quality"] = str(crf)
-
-    # Preset (only for libsvtav1)
-    if vcodec == "libsvtav1":
-        options["preset"] = str(preset) if preset is not None else "12"
-
-    return options
-
-
-def detect_available_hw_encoders() -> list[str]:
-    """Probe PyAV/FFmpeg for available hardware video encoders."""
-    available = []
-    for codec_name in HW_ENCODERS:
-        try:
-            av.codec.Codec(codec_name, "w")
-            available.append(codec_name)
-        except Exception:  # nosec B110
-            pass  # nosec B110
-    return available
-
-
-def resolve_vcodec(vcodec: str) -> str:
-    """Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1."""
-    if vcodec not in VALID_VIDEO_CODECS:
-        raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
-    if vcodec != "auto":
-        logging.info(f"Using video codec: {vcodec}")
-        return vcodec
-    available = detect_available_hw_encoders()
-    for encoder in HW_ENCODERS:
-        if encoder in available:
-            logging.info(f"Auto-selected video codec: {encoder}")
-            return encoder
-    logging.info("No hardware encoder available, falling back to software encoder 'libsvtav1'")
-    return "libsvtav1"
-

 def get_safe_default_codec():
    if importlib.util.find_spec("torchcodec"):
@@ -390,13 +309,14 @@ def encode_video_frames(
    g: int | None = 2,
    crf: int | None = 30,
    fast_decode: int = 0,
-    log_level: int | None = av.logging.WARNING,
+    log_level: int | None = av.logging.ERROR,
    overwrite: bool = False,
    preset: int | None = None,
-    encoder_threads: int | None = None,
 ) -> None:
    """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
-    vcodec = resolve_vcodec(vcodec)
+    # Check encoder availability
+    if vcodec not in ["h264", "hevc", "libsvtav1"]:
+        raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")

    video_path = Path(video_path)
    imgs_dir = Path(imgs_dir)
@@ -427,22 +347,21 @@ def encode_video_frames(
        width, height = dummy_image.size

    # Define video codec options
-    video_options = _get_codec_options(vcodec, g, crf, preset)
+    video_options = {}
+
+    if g is not None:
+        video_options["g"] = str(g)
+
+    if crf is not None:
+        video_options["crf"] = str(crf)

    if fast_decode:
        key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
        video_options[key] = value

-    if encoder_threads is not None:
-        if vcodec == "libsvtav1":
-            lp_param = f"lp={encoder_threads}"
-            if "svtav1-params" in video_options:
-                video_options["svtav1-params"] += f":{lp_param}"
-            else:
-                video_options["svtav1-params"] = lp_param
-        else:
-            video_options["threads"] = str(encoder_threads)
+    if vcodec == "libsvtav1":
+        video_options["preset"] = str(preset) if preset is not None else "12"

    # Set logging level
    if log_level is not None:
@@ -561,348 +480,6 @@ def concatenate_video_files(
    Path(tmp_concatenate_path).unlink()


-class _CameraEncoderThread(threading.Thread):
-    """A thread that encodes video frames streamed via a queue into an MP4 file.
-
-    One instance is created per camera per episode. Frames are received as numpy arrays
-    from the main thread, encoded in real-time using PyAV (which releases the GIL during
-    encoding), and written to disk. Stats are computed incrementally using
-    RunningQuantileStats and returned via result_queue.
-    """
-
-    def __init__(
-        self,
-        video_path: Path,
-        fps: int,
-        vcodec: str,
-        pix_fmt: str,
-        g: int | None,
-        crf: int | None,
-        preset: int | None,
-        frame_queue: queue.Queue,
-        result_queue: queue.Queue,
-        stop_event: threading.Event,
-        encoder_threads: int | None = None,
-    ):
-        super().__init__(daemon=True)
-        self.video_path = video_path
-        self.fps = fps
-        self.vcodec = vcodec
-        self.pix_fmt = pix_fmt
-        self.g = g
-        self.crf = crf
-        self.preset = preset
-        self.frame_queue = frame_queue
-        self.result_queue = result_queue
-        self.stop_event = stop_event
-        self.encoder_threads = encoder_threads
-
-    def run(self) -> None:
-        from lerobot.datasets.compute_stats import RunningQuantileStats, auto_downsample_height_width
-
-        container = None
-        output_stream = None
-        stats_tracker = RunningQuantileStats()
-        frame_count = 0
-
-        try:
-            logging.getLogger("libav").setLevel(av.logging.WARNING)
-
-            while True:
-                try:
-                    frame_data = self.frame_queue.get(timeout=1)
-                except queue.Empty:
-                    if self.stop_event.is_set():
-                        break
-                    continue
-
-                if frame_data is None:
-                    # Sentinel: flush and close
-                    break
-
-                # Ensure HWC uint8 numpy array
-                if isinstance(frame_data, np.ndarray):
-                    if frame_data.ndim == 3 and frame_data.shape[0] == 3:
-                        # CHW -> HWC
-                        frame_data = frame_data.transpose(1, 2, 0)
-                    if frame_data.dtype != np.uint8:
-                        frame_data = (frame_data * 255).astype(np.uint8)
-
-                # Open container on first frame (to get width/height)
-                if container is None:
-                    height, width = frame_data.shape[:2]
-                    video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset)
-                    if self.encoder_threads is not None:
-                        if self.vcodec == "libsvtav1":
-                            lp_param = f"lp={self.encoder_threads}"
-                            if "svtav1-params" in video_options:
-                                video_options["svtav1-params"] += f":{lp_param}"
-                            else:
-                                video_options["svtav1-params"] = lp_param
-                        else:
-                            video_options["threads"] = str(self.encoder_threads)
-                    Path(self.video_path).parent.mkdir(parents=True, exist_ok=True)
-                    container = av.open(str(self.video_path), "w")
-                    output_stream = container.add_stream(self.vcodec, self.fps, options=video_options)
-                    output_stream.pix_fmt = self.pix_fmt
-                    output_stream.width = width
-                    output_stream.height = height
-                    output_stream.time_base = Fraction(1, self.fps)
-
-                # Encode frame with explicit timestamps
-                pil_img = Image.fromarray(frame_data)
-                video_frame = av.VideoFrame.from_image(pil_img)
-                video_frame.pts = frame_count
-                video_frame.time_base = Fraction(1, self.fps)
-                packet = output_stream.encode(video_frame)
-                if packet:
-                    container.mux(packet)
-
-                # Update stats with downsampled frame (per-channel stats like compute_episode_stats)
-                img_chw = frame_data.transpose(2, 0, 1)  # HWC -> CHW
-                img_downsampled = auto_downsample_height_width(img_chw)
-                # Reshape CHW to (H*W, C) for per-channel stats
-                channels = img_downsampled.shape[0]
-                img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
-                stats_tracker.update(img_for_stats)
-
-                frame_count += 1
-
-            # Flush encoder
-            if output_stream is not None:
-                packet = output_stream.encode()
-                if packet:
-                    container.mux(packet)
-
-            if container is not None:
-                container.close()
-
-            av.logging.restore_default_callback()
-
-            # Get stats and put on result queue
-            if frame_count >= 2:
-                stats = stats_tracker.get_statistics()
-                self.result_queue.put(("ok", stats))
-            else:
-                self.result_queue.put(("ok", None))
-
-        except Exception as e:
-            logging.error(f"Encoder thread error: {e}")
-            if container is not None:
-                with contextlib.suppress(Exception):
-                    container.close()
-            self.result_queue.put(("error", str(e)))
-
-
-class StreamingVideoEncoder:
-    """Manages per-camera encoder threads for real-time video encoding during recording.
-
-    Instead of writing frames as PNG images and then encoding to MP4 at episode end,
-    this class streams frames directly to encoder threads, eliminating the
-    PNG round-trip and making save_episode() near-instant.
-
-    Uses threading instead of multiprocessing to avoid the overhead of pickling large
-    numpy arrays through multiprocessing.Queue. PyAV's encode() releases the GIL,
-    so encoding runs in parallel with the main recording loop.
-    """
-
-    def __init__(
-        self,
-        fps: int,
-        vcodec: str = "libsvtav1",
-        pix_fmt: str = "yuv420p",
-        g: int | None = 2,
-        crf: int | None = 30,
-        preset: int | None = None,
-        queue_maxsize: int = 30,
-        encoder_threads: int | None = None,
-    ):
-        self.fps = fps
-        self.vcodec = resolve_vcodec(vcodec)
-        self.pix_fmt = pix_fmt
-        self.g = g
-        self.crf = crf
-        self.preset = preset
-        self.queue_maxsize = queue_maxsize
-        self.encoder_threads = encoder_threads
-
-        self._frame_queues: dict[str, queue.Queue] = {}
-        self._result_queues: dict[str, queue.Queue] = {}
-        self._threads: dict[str, _CameraEncoderThread] = {}
-        self._stop_events: dict[str, threading.Event] = {}
-        self._video_paths: dict[str, Path] = {}
-        self._dropped_frames: dict[str, int] = {}
-        self._episode_active = False
-
-    def start_episode(self, video_keys: list[str], temp_dir: Path) -> None:
-        """Start encoder threads for a new episode.
-
-        Args:
-            video_keys: List of video feature keys (e.g. ["observation.images.laptop"])
-            temp_dir: Base directory for temporary MP4 files
-        """
-        if self._episode_active:
-            self.cancel_episode()
-
-        self._dropped_frames.clear()
-
-        for video_key in video_keys:
-            frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize)
-            result_queue: queue.Queue = queue.Queue(maxsize=1)
-            stop_event = threading.Event()
-
-            temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
-            video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"
-
-            encoder_thread = _CameraEncoderThread(
-                video_path=video_path,
-                fps=self.fps,
-                vcodec=self.vcodec,
-                pix_fmt=self.pix_fmt,
-                g=self.g,
-                crf=self.crf,
-                preset=self.preset,
-                frame_queue=frame_queue,
-                result_queue=result_queue,
-                stop_event=stop_event,
-                encoder_threads=self.encoder_threads,
-            )
-            encoder_thread.start()
-
-            self._frame_queues[video_key] = frame_queue
-            self._result_queues[video_key] = result_queue
-            self._threads[video_key] = encoder_thread
-            self._stop_events[video_key] = stop_event
-            self._video_paths[video_key] = video_path
-
-        self._episode_active = True
-
-    def feed_frame(self, video_key: str, image: np.ndarray) -> None:
-        """Feed a frame to the encoder for a specific camera.
-
-        A copy of the image is made before enqueueing to prevent race conditions
-        with camera drivers that may reuse buffers. If the encoder queue is full
-        (encoder can't keep up), the frame is dropped with a warning instead of
-        crashing the recording session.
-
-        Args:
-            video_key: The video feature key
-            image: numpy array in (H,W,C) or (C,H,W) format, uint8 or float
-
-        Raises:
-            RuntimeError: If the encoder thread has crashed
-        """
-        if not self._episode_active:
-            raise RuntimeError("No active episode. Call start_episode() first.")
-
-        thread = self._threads[video_key]
-        if not thread.is_alive():
-            # Check for error
-            try:
-                status, msg = self._result_queues[video_key].get_nowait()
-                if status == "error":
-                    raise RuntimeError(f"Encoder thread for {video_key} crashed: {msg}")
-            except queue.Empty:
-                pass
-            raise RuntimeError(f"Encoder thread for {video_key} is not alive")
-
-        try:
-            self._frame_queues[video_key].put(image.copy(), timeout=0.1)
-        except queue.Full:
-            self._dropped_frames[video_key] = self._dropped_frames.get(video_key, 0) + 1
-            count = self._dropped_frames[video_key]
-            # Log periodically to avoid spam (1st, then every 10th)
-            if count == 1 or count % 10 == 0:
-                logging.warning(
-                    f"Encoder queue full for {video_key}, dropped {count} frame(s). "
-                    f"Consider using vcodec='auto' for hardware encoding or increasing encoder_queue_maxsize."
-                )
-
-    def finish_episode(self) -> dict[str, tuple[Path, dict | None]]:
-        """Finish encoding the current episode.
-
-        Sends sentinel values, waits for encoder threads to complete,
-        and collects results.
-
-        Returns:
-            Dict mapping video_key to (mp4_path, stats_dict_or_None)
-        """
-        if not self._episode_active:
-            raise RuntimeError("No active episode to finish.")
-
-        results = {}
-
-        # Report dropped frames
-        for video_key, count in self._dropped_frames.items():
-            if count > 0:
-                logging.warning(f"Episode finished with {count} dropped frame(s) for {video_key}.")
-
-        # Send sentinel to all queues
-        for video_key in self._frame_queues:
-            self._frame_queues[video_key].put(None)
-
-        # Wait for all threads and collect results
-        for video_key in self._threads:
-            self._threads[video_key].join(timeout=120)
-            if self._threads[video_key].is_alive():
-                logging.error(f"Encoder thread for {video_key} did not finish in time")
-                self._stop_events[video_key].set()
-                self._threads[video_key].join(timeout=5)
-                results[video_key] = (self._video_paths[video_key], None)
-                continue
-
-            try:
-                status, data = self._result_queues[video_key].get(timeout=5)
-                if status == "error":
-                    raise RuntimeError(f"Encoder thread for {video_key} failed: {data}")
-                results[video_key] = (self._video_paths[video_key], data)
-            except queue.Empty:
-                logging.error(f"No result from encoder thread for {video_key}")
-                results[video_key] = (self._video_paths[video_key], None)
-
-        self._cleanup()
-        self._episode_active = False
-        return results
-
-    def cancel_episode(self) -> None:
-        """Cancel the current episode, stopping encoder threads and cleaning up."""
-        if not self._episode_active:
-            return
-
-        # Signal all threads to stop
-        for video_key in self._stop_events:
-            self._stop_events[video_key].set()
-
-        # Wait for threads to finish
-        for video_key in self._threads:
-            self._threads[video_key].join(timeout=5)
-
-            # Clean up temp MP4 files
-            video_path = self._video_paths.get(video_key)
-            if video_path is not None and video_path.exists():
-                shutil.rmtree(str(video_path.parent), ignore_errors=True)
-
-        self._cleanup()
-        self._episode_active = False
-
-    def close(self) -> None:
-        """Close the encoder, canceling any in-progress episode."""
-        if self._episode_active:
-            self.cancel_episode()
-
-    def _cleanup(self) -> None:
-        """Clean up queues and thread tracking dicts."""
-        for q in self._frame_queues.values():
-            with contextlib.suppress(Exception):
-                while not q.empty():
-                    q.get_nowait()
-        self._frame_queues.clear()
-        self._result_queues.clear()
-        self._threads.clear()
-        self._stop_events.clear()
-        self._video_paths.clear()
-
-
@dataclass
 class VideoFrame:
    # TODO(rcadene, lhoestq): move to Hugging Face `datasets` repo
@@ -937,7 +514,7 @@ with warnings.catch_warnings():

 def get_audio_info(video_path: Path | str) -> dict:
    # Set logging level
-    logging.getLogger("libav").setLevel(av.logging.WARNING)
+    logging.getLogger("libav").setLevel(av.logging.ERROR)

    # Getting audio stream information
    audio_info = {}
@@ -969,7 +546,7 @@ def get_audio_info(video_path: Path | str) -> dict:

 def get_video_info(video_path: Path | str) -> dict:
    # Set logging level
-    logging.getLogger("libav").setLevel(av.logging.WARNING)
+    logging.getLogger("libav").setLevel(av.logging.ERROR)

    # Getting video stream information
    video_info = {}
@@ -1055,15 +632,8 @@ class VideoEncodingManager:
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
-        streaming_encoder = getattr(self.dataset, "_streaming_encoder", None)
-
-        if streaming_encoder is not None:
-            # Handle streaming encoder cleanup
-            if exc_type is not None:
-                streaming_encoder.cancel_episode()
-            streaming_encoder.close()
-        elif self.dataset.episodes_since_last_encoding > 0:
-            # Handle any remaining episodes that haven't been batch encoded
+        # Handle any remaining episodes that haven't been batch encoded
+        if self.dataset.episodes_since_last_encoding > 0:
            if exc_type is not None:
                logging.info("Exception occurred. Encoding remaining episodes before exit...")
            else:
@@ -1080,8 +650,8 @@ class VideoEncodingManager:
        # Finalize the dataset to properly close all writers
        self.dataset.finalize()

-        # Clean up episode images if recording was interrupted (only for non-streaming mode)
-        if exc_type is not None and streaming_encoder is None:
+        # Clean up episode images if recording was interrupted
+        if exc_type is not None:
            interrupted_episode_index = self.dataset.num_episodes
            for key in self.dataset.meta.video_keys:
                img_dir = self.dataset._get_image_file_path(
@@ -1095,12 +665,14 @@ class VideoEncodingManager:

        # Clean up any remaining images directory if it's empty
        img_dir = self.dataset.root / "images"
-        if img_dir.exists():
-            png_files = list(img_dir.rglob("*.png"))
-            if len(png_files) == 0:
+        # Check for any remaining PNG files
+        png_files = list(img_dir.rglob("*.png"))
+        if len(png_files) == 0:
+            # Only remove the images directory if no PNG files remain
+            if img_dir.exists():
                shutil.rmtree(img_dir)
                logging.debug("Cleaned up empty images directory")
-            else:
-                logging.debug(f"Images directory is not empty, containing {len(png_files)} PNG files")
+        else:
+            logging.debug(f"Images directory is not empty, containing {len(png_files)} PNG files")

        return False  # Don't suppress the original exception
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .robstride import RobstrideMotorsBus
-from .tables import *
@@ -1,120 +0,0 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Configuration tables for Damiao motors."""
-
-from enum import IntEnum
-
-
-# Motor type definitions
-class MotorType(IntEnum):
-    O0 = 0
-    O1 = 1
-    O2 = 2
-    O3 = 3
-    O4 = 4
-    O5 = 5
-    ELO5 = 6
-    O6 = 7
-
-
-class CommMode(IntEnum):
-    PrivateProtocole = 0
-    CANopen = 1
-    MIT = 2
-
-
-# Control modes
-class ControlMode(IntEnum):
-    MIT = 0
-    POS_VEL = 1
-    VEL = 2
-
-
-# Motor limit parameters [PMAX, VMAX, TMAX]
-# PMAX: Maximum position (rad)
-# VMAX: Maximum velocity (rad/s)
-# TMAX: Maximum torque (N·m)
-MOTOR_LIMIT_PARAMS: dict[MotorType, tuple[float, float, float]] = {
-    MotorType.O0: (12.57, 33, 14),
-    MotorType.O1: (12.57, 44, 17),
-    MotorType.O2: (12.57, 33, 20),
-    MotorType.O3: (12.57, 33, 60),
-    MotorType.O4: (12.57, 33, 120),
-    MotorType.O5: (12.57, 50, 5.5),
-    MotorType.ELO5: (12.57, 50, 6),
-    MotorType.O6: (112.5, 50, 36),
-}
-
-# Motor model names
-MODEL_NAMES = {
-    MotorType.O0: "O0",
-    MotorType.O1: "O1",
-    MotorType.O2: "O2",
-    MotorType.O3: "O3",
-    MotorType.O4: "O4",
-    MotorType.O5: "O5",
-    MotorType.ELO5: "ELO5",
-    MotorType.O6: "O6",
-}
-
-# Motor resolution table (encoder counts per revolution)
-MODEL_RESOLUTION = {
-    "O0": 65536,
-    "O1": 65536,
-    "O2": 65536,
-    "O3": 65536,
-    "O4": 65536,
-    "O5": 65536,
-    "ELO5": 65536,
-    "O6": 65536,
-}
-
-# CAN baudrates supported by Robstride motors
-AVAILABLE_BAUDRATES = [
-    1000000,  # 4: 1 mbps (default)
-]
-DEFAULT_BAUDRATE = 1000000
-
-# Default timeout in milliseconds
-DEFAULT_TIMEOUT_MS = 0  # disabled by default, otherwise 20000 is 1s
-
-
-# Data that should be normalized
-NORMALIZED_DATA = ["Present_Position", "Goal_Position"]
-
-
-# MIT control parameter ranges
-MIT_KP_RANGE = (0.0, 500.0)
-MIT_KD_RANGE = (0.0, 5.0)
-
-# CAN frame command IDs
-CAN_CMD_ENABLE = 0xFC
-CAN_CMD_DISABLE = 0xFD
-CAN_CMD_SET_ZERO = 0xFE
-CAN_CMD_CLEAR_FAULT = 0xFB
-
-
-CAN_CMD_QUERY_PARAM = 0x33
-CAN_CMD_WRITE_PARAM = 0x55
-CAN_CMD_SAVE_PARAM = 0xAA
-
-# CAN ID for parameter operations
-CAN_PARAM_ID = 0x7FF
-
-
-RUNNING_TIMEOUT = 0.001
-PARAM_TIMEOUT = 0.01
-
-STATE_CACHE_TTL_S = 0.02
@@ -14,7 +14,7 @@ from transformers.image_processing_utils import (
 )
 from transformers.image_processing_utils_fast import (
    BaseImageProcessorFast,
-    ImagesKwargs,
+    DefaultFastImageProcessorKwargs,
    group_images_by_shape,
    reorder_images,
 )
@@ -77,7 +77,7 @@ def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> tor
    return img[:, top:bottom, left:right]


-class Eagle25VLFastImageProcessorKwargs(ImagesKwargs):
+class Eagle25VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    max_dynamic_tiles: int | None
    min_dynamic_tiles: int | None
    use_thumbnail: bool | None
@@ -15,7 +15,6 @@
 # limitations under the License.

 import builtins
-import copy
 import logging
 import math
 from collections import deque
@@ -33,21 +32,13 @@ from lerobot.utils.import_utils import _transformers_available
 if TYPE_CHECKING or _transformers_available:
    from transformers.models.auto import CONFIG_MAPPING
    from transformers.models.gemma import modeling_gemma
-
-    from lerobot.policies.pi_gemma import (
-        PaliGemmaForConditionalGenerationWithPiGemma,
-        PiGemmaForCausalLM,
-        _gated_residual,
-        layernorm_forward,
-    )
+    from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
+    from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
 else:
    CONFIG_MAPPING = None
    modeling_gemma = None
-    PiGemmaForCausalLM = None
-    _gated_residual = None
-    layernorm_forward = None
-    PaliGemmaForConditionalGenerationWithPiGemma = None
-
+    GemmaForCausalLM = None
+    PaliGemmaForConditionalGeneration = None

 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.policies.pi0.configuration_pi0 import DEFAULT_IMAGE_SIZE, PI0Config
@@ -200,7 +191,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    if images.dtype == torch.uint8:
        resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8)
    elif images.dtype == torch.float32:
-        resized_images = resized_images.clamp(0.0, 1.0)
+        resized_images = resized_images.clamp(-1.0, 1.0)
    else:
        raise ValueError(f"Unsupported image dtype: {images.dtype}")

@@ -211,7 +202,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    pad_w1 = pad_w0 + remainder_w

    # Pad
-    constant_value = 0 if images.dtype == torch.uint8 else 0.0
+    constant_value = 0 if images.dtype == torch.uint8 else -1.0
    padded_images = F.pad(
        resized_images,
        (pad_w0, pad_w1, pad_h0, pad_h1),  # left, right, top, bottom
@@ -230,14 +221,14 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
 def compute_layer_complete(
    layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond, paligemma, gemma_expert
 ):
-    models = [paligemma.model.language_model, gemma_expert.model]
+    models = [paligemma.language_model, gemma_expert.model]
    query_states = []
    key_states = []
    value_states = []
    gates = []
    for i, hidden_states in enumerate(inputs_embeds):
        layer = models[i].layers[layer_idx]
-        hidden_states, gate = layernorm_forward(layer.input_layernorm, hidden_states, adarms_cond[i])
+        hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i])  # noqa: PLW2901
        gates.append(gate)
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
@@ -263,10 +254,10 @@ def compute_layer_complete(
        query_states, key_states, cos, sin, unsqueeze_dim=1
    )
    batch_size = query_states.shape[0]
-    scaling = paligemma.model.language_model.layers[layer_idx].self_attn.scaling
+    scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling
    # Attention computation
    att_output, _ = modeling_gemma.eager_attention_forward(
-        paligemma.model.language_model.layers[layer_idx].self_attn,
+        paligemma.language_model.layers[layer_idx].self_attn,
        query_states,
        key_states,
        value_states,
@@ -274,7 +265,7 @@ def compute_layer_complete(
        scaling,
    )
    # Get head_dim from the current layer, not from the model
-    head_dim = paligemma.model.language_model.layers[layer_idx].self_attn.head_dim
+    head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
    att_output = att_output.reshape(batch_size, -1, 1 * 8 * head_dim)
    # Process layer outputs
    outputs_embeds = []
@@ -286,15 +277,15 @@ def compute_layer_complete(
            att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
        out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
        # first residual
-        out_emb = _gated_residual(hidden_states, out_emb, gates[i])
+        out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i])  # noqa: SLF001
        after_first_residual = out_emb.clone()
-        out_emb, gate = layernorm_forward(layer.post_attention_layernorm, out_emb, adarms_cond[i])
+        out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
        # Convert to bfloat16 if the next layer (mlp) uses bfloat16
        if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
            out_emb = out_emb.to(dtype=torch.bfloat16)
        out_emb = layer.mlp(out_emb)
        # second residual
-        out_emb = _gated_residual(after_first_residual, out_emb, gate)
+        out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate)  # noqa: SLF001
        outputs_embeds.append(out_emb)
        start_pos = end_pos
    return outputs_embeds
@@ -367,7 +358,7 @@ class PaliGemmaWithExpertModel(
        vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth
        vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads
        vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh"
-        vlm_config_hf.text_config.dtype = "float32"
+        vlm_config_hf.text_config.torch_dtype = "float32"
        vlm_config_hf.text_config.vocab_size = 257152
        vlm_config_hf.text_config.use_adarms = use_adarms[0]
        vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
@@ -375,7 +366,7 @@ class PaliGemmaWithExpertModel(
        vlm_config_hf.vision_config.intermediate_size = 4304
        vlm_config_hf.vision_config.projection_dim = 2048
        vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
-        vlm_config_hf.vision_config.dtype = "float32"
+        vlm_config_hf.vision_config.torch_dtype = "float32"

        action_expert_config_hf = CONFIG_MAPPING["gemma"](
            head_dim=action_expert_config.head_dim,
@@ -386,13 +377,13 @@ class PaliGemmaWithExpertModel(
            num_key_value_heads=action_expert_config.num_kv_heads,
            vocab_size=257152,
            hidden_activation="gelu_pytorch_tanh",
-            dtype="float32",
+            torch_dtype="float32",
            use_adarms=use_adarms[1],
            adarms_cond_dim=action_expert_config.width if use_adarms[1] else None,
        )

-        self.paligemma = PaliGemmaForConditionalGenerationWithPiGemma(config=vlm_config_hf)
-        self.gemma_expert = PiGemmaForCausalLM(config=action_expert_config_hf)
+        self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
+        self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
        self.gemma_expert.model.embed_tokens = None

        self.to_bfloat16_for_selected_params(precision)
@@ -407,11 +398,10 @@ class PaliGemmaWithExpertModel(
        else:
            raise ValueError(f"Invalid precision: {precision}")

-        # Keep full vision path in float32 so we never toggle (toggle causes optimizer
-        # "same dtype" error). Align with PI05.
        params_to_keep_float32 = [
-            "vision_tower",
-            "multi_modal_projector",
+            "vision_tower.vision_model.embeddings.patch_embedding.weight",
+            "vision_tower.vision_model.embeddings.patch_embedding.bias",
+            "vision_tower.vision_model.embeddings.position_embedding.weight",
            "input_layernorm",
            "post_attention_layernorm",
            "model.norm",
@@ -423,8 +413,8 @@ class PaliGemmaWithExpertModel(

    def _set_requires_grad(self):
        if self.freeze_vision_encoder:
-            self.paligemma.model.vision_tower.eval()
-            for param in self.paligemma.model.vision_tower.parameters():
+            self.paligemma.vision_tower.eval()
+            for param in self.paligemma.vision_tower.parameters():
                param.requires_grad = False
        if self.train_expert_only:
            self.paligemma.eval()
@@ -434,23 +424,15 @@ class PaliGemmaWithExpertModel(
    def train(self, mode: bool = True):
        super().train(mode)
        if self.freeze_vision_encoder:
-            self.paligemma.model.vision_tower.eval()
+            self.paligemma.vision_tower.eval()
        if self.train_expert_only:
            self.paligemma.eval()

    def embed_image(self, image: torch.Tensor):
-        # Vision tower and multi_modal_projector are kept in float32 (params_to_keep_float32). Align with PI05.
-        out_dtype = image.dtype
-        if image.dtype != torch.float32:
-            image = image.to(torch.float32)
-        image_outputs = self.paligemma.model.get_image_features(image)
-        features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
-        if features.dtype != out_dtype:
-            features = features.to(out_dtype)
-        return features
+        return self.paligemma.model.get_image_features(image)

    def embed_language_tokens(self, tokens: torch.Tensor):
-        return self.paligemma.model.language_model.embed_tokens(tokens)
+        return self.paligemma.language_model.embed_tokens(tokens)

    def forward(
        self,
@@ -464,7 +446,7 @@ class PaliGemmaWithExpertModel(
        if adarms_cond is None:
            adarms_cond = [None, None]
        if inputs_embeds[1] is None:
-            prefix_output = self.paligemma.model.language_model.forward(
+            prefix_output = self.paligemma.language_model.forward(
                inputs_embeds=inputs_embeds[0],
                attention_mask=attention_mask,
                position_ids=position_ids,
@@ -488,7 +470,7 @@ class PaliGemmaWithExpertModel(
            prefix_output = None
            prefix_past_key_values = None
        else:
-            models = [self.paligemma.model.language_model, self.gemma_expert.model]
+            models = [self.paligemma.language_model, self.gemma_expert.model]
            num_layers = self.paligemma.config.text_config.num_hidden_layers

            # Check if gradient checkpointing is enabled for any of the models
@@ -528,7 +510,7 @@ class PaliGemmaWithExpertModel(
            def compute_final_norms(inputs_embeds, adarms_cond):
                outputs_embeds = []
                for i, hidden_states in enumerate(inputs_embeds):
-                    out_emb, _ = layernorm_forward(models[i].norm, hidden_states, adarms_cond[i])
+                    out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
                    outputs_embeds.append(out_emb)
                return outputs_embeds

@@ -594,19 +576,29 @@ class PI0Pytorch(nn.Module):  # see openpi `PI0Pytorch`
            # Also compile the main forward pass used during training
            self.forward = torch.compile(self.forward, mode=config.compile_mode)

+        msg = """An incorrect transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues"""
+
+        try:
+            from transformers.models.siglip import check
+
+            if not check.check_whether_transformers_replace_is_installed_correctly():
+                raise ValueError(msg)
+        except ImportError:
+            raise ValueError(msg) from None
+
    def gradient_checkpointing_enable(self):
        """Enable gradient checkpointing for memory optimization."""
        self.gradient_checkpointing_enabled = True
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = True
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = True
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
        self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
        logging.info("Enabled gradient checkpointing for PI0Pytorch model")

    def gradient_checkpointing_disable(self):
        """Disable gradient checkpointing."""
        self.gradient_checkpointing_enabled = False
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = False
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = False
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
        self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
        logging.info("Disabled gradient checkpointing for PI0Pytorch model")

@@ -768,7 +760,7 @@ class PI0Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)

        if (
-            self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
+            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
            == torch.bfloat16
        ):
            suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
@@ -842,7 +834,7 @@ class PI0Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1

        prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
-        self.paligemma_with_expert.paligemma.model.language_model.config._attn_implementation = "eager"  # noqa: SLF001
+        self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager"  # noqa: SLF001

        _, past_key_values = self.paligemma_with_expert.forward(
            attention_mask=prefix_att_2d_masks_4d,
@@ -916,7 +908,6 @@ class PI0Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
        self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager"  # noqa: SLF001

-        past_key_values = copy.deepcopy(past_key_values)
        outputs_embeds, _ = self.paligemma_with_expert.forward(
            attention_mask=full_att_2d_masks_4d,
            position_ids=position_ids,
@@ -1006,12 +997,14 @@ class PI0Policy(PreTrainedPolicy):
        # Check if dataset_stats were provided in kwargs
        model = cls(config, **kwargs)

-        # Load state dict (expects keys with "model." prefix)
+        # Now manually load and remap the state dict
        try:
+            # Try to load the pytorch_model.bin or model.safetensors file
            print(f"Loading model from: {pretrained_name_or_path}")
            try:
                from transformers.utils import cached_file

+                # Try safetensors first
                resolved_file = cached_file(
                    pretrained_name_or_path,
                    "model.safetensors",
@@ -1019,7 +1012,7 @@ class PI0Policy(PreTrainedPolicy):
                    force_download=kwargs.get("force_download", False),
                    resume_download=kwargs.get("resume_download"),
                    proxies=kwargs.get("proxies"),
-                    token=kwargs.get("token"),
+                    use_auth_token=kwargs.get("use_auth_token"),
                    revision=kwargs.get("revision"),
                    local_files_only=kwargs.get("local_files_only", False),
                )
@@ -1032,7 +1025,7 @@ class PI0Policy(PreTrainedPolicy):
                print("Returning model without loading pretrained weights")
                return model

-            # First, fix any key differences (see openpi model.py, _fix_pytorch_state_dict_keys)
+            # First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys`
            fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config)

            # Then add "model." prefix for all keys that don't already have it
@@ -1077,7 +1070,7 @@ class PI0Policy(PreTrainedPolicy):
                print("All keys loaded successfully!")

        except Exception as e:
-            print(f"Warning: Could not load state dict: {e}")
+            print(f"Warning: Could not remap state dict keys: {e}")

        return model

@@ -1127,14 +1120,6 @@ class PI0Policy(PreTrainedPolicy):
                # Some checkpoints might have this, but current model expects different structure
                logging.warning(f"Vision embedding key might need handling: {key}")

-            if (
-                key == "model.paligemma_with_expert.paligemma.lm_head.weight"
-                or key == "paligemma_with_expert.paligemma.lm_head.weight"
-            ):
-                fixed_state_dict[
-                    "model.paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"
-                ] = value.clone()
-
            fixed_state_dict[new_key] = value

        return fixed_state_dict
@@ -15,7 +15,6 @@
 # limitations under the License.

 import builtins
-import copy
 import logging
 import math
 from collections import deque
@@ -33,20 +32,14 @@ from lerobot.utils.import_utils import _transformers_available
 if TYPE_CHECKING or _transformers_available:
    from transformers.models.auto import CONFIG_MAPPING
    from transformers.models.gemma import modeling_gemma
-
-    from lerobot.policies.pi_gemma import (
-        PaliGemmaForConditionalGenerationWithPiGemma,
-        PiGemmaForCausalLM,
-        _gated_residual,
-        layernorm_forward,
-    )
+    from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
+    from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
 else:
    CONFIG_MAPPING = None
    modeling_gemma = None
-    PiGemmaForCausalLM = None
-    _gated_residual = None
-    layernorm_forward = None
-    PaliGemmaForConditionalGenerationWithPiGemma = None
+    GemmaForCausalLM = None
+    PaliGemmaForConditionalGeneration = None
+
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.policies.pi05.configuration_pi05 import DEFAULT_IMAGE_SIZE, PI05Config
 from lerobot.policies.pretrained import PreTrainedPolicy, T
@@ -196,7 +189,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    if images.dtype == torch.uint8:
        resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8)
    elif images.dtype == torch.float32:
-        resized_images = resized_images.clamp(0.0, 1.0)
+        resized_images = resized_images.clamp(-1.0, 1.0)
    else:
        raise ValueError(f"Unsupported image dtype: {images.dtype}")

@@ -207,7 +200,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    pad_w1 = pad_w0 + remainder_w

    # Pad
-    constant_value = 0 if images.dtype == torch.uint8 else 0.0
+    constant_value = 0 if images.dtype == torch.uint8 else -1.0
    padded_images = F.pad(
        resized_images,
        (pad_w0, pad_w1, pad_h0, pad_h1),  # left, right, top, bottom
@@ -226,14 +219,14 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
 def compute_layer_complete(
    layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond, paligemma, gemma_expert
 ):
-    models = [paligemma.model.language_model, gemma_expert.model]
+    models = [paligemma.language_model, gemma_expert.model]
    query_states = []
    key_states = []
    value_states = []
    gates = []
    for i, hidden_states in enumerate(inputs_embeds):
        layer = models[i].layers[layer_idx]
-        hidden_states, gate = layernorm_forward(layer.input_layernorm, hidden_states, adarms_cond[i])
+        hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i])  # noqa: PLW2901
        gates.append(gate)
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
@@ -259,10 +252,10 @@ def compute_layer_complete(
        query_states, key_states, cos, sin, unsqueeze_dim=1
    )
    batch_size = query_states.shape[0]
-    scaling = paligemma.model.language_model.layers[layer_idx].self_attn.scaling
+    scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling
    # Attention computation
    att_output, _ = modeling_gemma.eager_attention_forward(
-        paligemma.model.language_model.layers[layer_idx].self_attn,
+        paligemma.language_model.layers[layer_idx].self_attn,
        query_states,
        key_states,
        value_states,
@@ -270,7 +263,7 @@ def compute_layer_complete(
        scaling,
    )
    # Get head_dim from the current layer, not from the model
-    head_dim = paligemma.model.language_model.layers[layer_idx].self_attn.head_dim
+    head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
    att_output = att_output.reshape(batch_size, -1, 1 * 8 * head_dim)
    # Process layer outputs
    outputs_embeds = []
@@ -282,15 +275,15 @@ def compute_layer_complete(
            att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
        out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
        # first residual
-        out_emb = _gated_residual(hidden_states, out_emb, gates[i])
+        out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i])  # noqa: SLF001
        after_first_residual = out_emb.clone()
-        out_emb, gate = layernorm_forward(layer.post_attention_layernorm, out_emb, adarms_cond[i])
+        out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
        # Convert to bfloat16 if the next layer (mlp) uses bfloat16
        if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
            out_emb = out_emb.to(dtype=torch.bfloat16)
        out_emb = layer.mlp(out_emb)
        # second residual
-        out_emb = _gated_residual(after_first_residual, out_emb, gate)
+        out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate)  # noqa: SLF001
        outputs_embeds.append(out_emb)
        start_pos = end_pos
    return outputs_embeds
@@ -363,7 +356,7 @@ class PaliGemmaWithExpertModel(
        vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth
        vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads
        vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh"
-        vlm_config_hf.text_config.dtype = "float32"
+        vlm_config_hf.text_config.torch_dtype = "float32"
        vlm_config_hf.text_config.vocab_size = 257152
        vlm_config_hf.text_config.use_adarms = use_adarms[0]
        vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
@@ -371,7 +364,7 @@ class PaliGemmaWithExpertModel(
        vlm_config_hf.vision_config.intermediate_size = 4304
        vlm_config_hf.vision_config.projection_dim = 2048
        vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
-        vlm_config_hf.vision_config.dtype = "float32"
+        vlm_config_hf.vision_config.torch_dtype = "float32"

        action_expert_config_hf = CONFIG_MAPPING["gemma"](
            head_dim=action_expert_config.head_dim,
@@ -382,13 +375,13 @@ class PaliGemmaWithExpertModel(
            num_key_value_heads=action_expert_config.num_kv_heads,
            vocab_size=257152,
            hidden_activation="gelu_pytorch_tanh",
-            dtype="float32",
+            torch_dtype="float32",
            use_adarms=use_adarms[1],
            adarms_cond_dim=action_expert_config.width if use_adarms[1] else None,
        )

-        self.paligemma = PaliGemmaForConditionalGenerationWithPiGemma(config=vlm_config_hf)
-        self.gemma_expert = PiGemmaForCausalLM(config=action_expert_config_hf)
+        self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
+        self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
        self.gemma_expert.model.embed_tokens = None

        self.to_bfloat16_for_selected_params(precision)
@@ -403,11 +396,10 @@ class PaliGemmaWithExpertModel(
        else:
            raise ValueError(f"Invalid precision: {precision}")

-        # Keep full vision path in float32 so we never toggle (toggle causes optimizer
-        # "same dtype" error). Saves memory vs full float32; more memory than only 3 params.
        params_to_keep_float32 = [
-            "vision_tower",
-            "multi_modal_projector",
+            "vision_tower.vision_model.embeddings.patch_embedding.weight",
+            "vision_tower.vision_model.embeddings.patch_embedding.bias",
+            "vision_tower.vision_model.embeddings.position_embedding.weight",
            "input_layernorm",
            "post_attention_layernorm",
            "model.norm",
@@ -419,8 +411,8 @@ class PaliGemmaWithExpertModel(

    def _set_requires_grad(self):
        if self.freeze_vision_encoder:
-            self.paligemma.model.vision_tower.eval()
-            for param in self.paligemma.model.vision_tower.parameters():
+            self.paligemma.vision_tower.eval()
+            for param in self.paligemma.vision_tower.parameters():
                param.requires_grad = False
        if self.train_expert_only:
            self.paligemma.eval()
@@ -430,23 +422,15 @@ class PaliGemmaWithExpertModel(
    def train(self, mode: bool = True):
        super().train(mode)
        if self.freeze_vision_encoder:
-            self.paligemma.model.vision_tower.eval()
+            self.paligemma.vision_tower.eval()
        if self.train_expert_only:
            self.paligemma.eval()

    def embed_image(self, image: torch.Tensor):
-        # Vision tower and multi_modal_projector are kept in float32 (params_to_keep_float32).
-        out_dtype = image.dtype
-        if image.dtype != torch.float32:
-            image = image.to(torch.float32)
-        image_outputs = self.paligemma.model.get_image_features(image)
-        features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
-        if features.dtype != out_dtype:
-            features = features.to(out_dtype)
-        return features
+        return self.paligemma.model.get_image_features(image)

    def embed_language_tokens(self, tokens: torch.Tensor):
-        return self.paligemma.model.language_model.embed_tokens(tokens)
+        return self.paligemma.language_model.embed_tokens(tokens)

    def forward(
        self,
@@ -460,7 +444,7 @@ class PaliGemmaWithExpertModel(
        if adarms_cond is None:
            adarms_cond = [None, None]
        if inputs_embeds[1] is None:
-            prefix_output = self.paligemma.model.language_model.forward(
+            prefix_output = self.paligemma.language_model.forward(
                inputs_embeds=inputs_embeds[0],
                attention_mask=attention_mask,
                position_ids=position_ids,
@@ -484,7 +468,7 @@ class PaliGemmaWithExpertModel(
            prefix_output = None
            prefix_past_key_values = None
        else:
-            models = [self.paligemma.model.language_model, self.gemma_expert.model]
+            models = [self.paligemma.language_model, self.gemma_expert.model]
            num_layers = self.paligemma.config.text_config.num_hidden_layers

            # Check if gradient checkpointing is enabled for any of the models
@@ -524,7 +508,7 @@ class PaliGemmaWithExpertModel(
            def compute_final_norms(inputs_embeds, adarms_cond):
                outputs_embeds = []
                for i, hidden_states in enumerate(inputs_embeds):
-                    out_emb, _ = layernorm_forward(models[i].norm, hidden_states, adarms_cond[i])
+                    out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
                    outputs_embeds.append(out_emb)
                return outputs_embeds

@@ -589,19 +573,29 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
            # Also compile the main forward pass used during training
            self.forward = torch.compile(self.forward, mode=config.compile_mode)

+        msg = """An incorrect transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues"""
+
+        try:
+            from transformers.models.siglip import check
+
+            if not check.check_whether_transformers_replace_is_installed_correctly():
+                raise ValueError(msg)
+        except ImportError:
+            raise ValueError(msg) from None
+
    def gradient_checkpointing_enable(self):
        """Enable gradient checkpointing for memory optimization."""
        self.gradient_checkpointing_enabled = True
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = True
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = True
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
        self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
        logging.info("Enabled gradient checkpointing for PI05Pytorch model")

    def gradient_checkpointing_disable(self):
        """Disable gradient checkpointing."""
        self.gradient_checkpointing_enabled = False
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = False
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = False
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
        self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
        logging.info("Disabled gradient checkpointing for PI05Pytorch model")

@@ -743,7 +737,7 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(x_t, time)

        if (
-            self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
+            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
            == torch.bfloat16
        ):
            suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
@@ -814,7 +808,7 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1

        prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
-        self.paligemma_with_expert.paligemma.model.language_model.config._attn_implementation = "eager"  # noqa: SLF001
+        self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager"  # noqa: SLF001

        _, past_key_values = self.paligemma_with_expert.forward(
            attention_mask=prefix_att_2d_masks_4d,
@@ -886,7 +880,6 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
        self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager"  # noqa: SLF001

-        past_key_values = copy.deepcopy(past_key_values)
        outputs_embeds, _ = self.paligemma_with_expert.forward(
            attention_mask=full_att_2d_masks_4d,
            position_ids=position_ids,
@@ -976,12 +969,14 @@ class PI05Policy(PreTrainedPolicy):
        # Check if dataset_stats were provided in kwargs
        model = cls(config, **kwargs)

-        # Load state dict (expects keys with "model." prefix)
+        # Now manually load and remap the state dict
        try:
+            # Try to load the pytorch_model.bin or model.safetensors file
            print(f"Loading model from: {pretrained_name_or_path}")
            try:
                from transformers.utils import cached_file

+                # Try safetensors first
                resolved_file = cached_file(
                    pretrained_name_or_path,
                    "model.safetensors",
@@ -989,7 +984,7 @@ class PI05Policy(PreTrainedPolicy):
                    force_download=kwargs.get("force_download", False),
                    resume_download=kwargs.get("resume_download"),
                    proxies=kwargs.get("proxies"),
-                    token=kwargs.get("token"),
+                    use_auth_token=kwargs.get("use_auth_token"),
                    revision=kwargs.get("revision"),
                    local_files_only=kwargs.get("local_files_only", False),
                )
@@ -1002,7 +997,7 @@ class PI05Policy(PreTrainedPolicy):
                print("Returning model without loading pretrained weights")
                return model

-            # First, fix any key differences (see openpi model.py, _fix_pytorch_state_dict_keys)
+            # First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys`
            fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config)

            # Then add "model." prefix for all keys that don't already have it
@@ -1014,6 +1009,8 @@ class PI05Policy(PreTrainedPolicy):
                    new_key = f"model.{key}"
                    remapped_state_dict[new_key] = value
                    remap_count += 1
+                    if remap_count <= 10:  # Only print first 10 to avoid spam
+                        print(f"Remapped: {key} -> {new_key}")
                else:
                    remapped_state_dict[key] = value

@@ -1047,7 +1044,7 @@ class PI05Policy(PreTrainedPolicy):
                print("All keys loaded successfully!")

        except Exception as e:
-            print(f"Warning: Could not load state dict: {e}")
+            print(f"Warning: Could not remap state dict keys: {e}")

        return model

@@ -1101,14 +1098,6 @@ class PI05Policy(PreTrainedPolicy):
                # Some checkpoints might have this, but current model expects different structure
                logging.warning(f"Vision embedding key might need handling: {key}")

-            if (
-                key == "model.paligemma_with_expert.paligemma.lm_head.weight"
-                or key == "paligemma_with_expert.paligemma.lm_head.weight"
-            ):
-                fixed_state_dict[
-                    "model.paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"
-                ] = value.clone()
-
            fixed_state_dict[new_key] = value

        return fixed_state_dict
@@ -23,6 +23,7 @@ import torch

 from lerobot.configs.types import PipelineFeatureType, PolicyFeature
 from lerobot.policies.pi05.configuration_pi05 import PI05Config
+from lerobot.policies.pi05.modeling_pi05 import pad_vector
 from lerobot.processor import (
    AddBatchDimensionProcessorStep,
    DeviceProcessorStep,
@@ -67,6 +68,9 @@ class Pi05PrepareStateTokenizerProcessorStep(ProcessorStep):
        # TODO: check if this necessary
        state = deepcopy(state)

+        # Prepare state (pad to max_state_dim)
+        state = pad_vector(state, self.max_state_dim)
+
        # State should already be normalized to [-1, 1] by the NormalizerProcessorStep that runs before this step
        # Discretize into 256 bins (see openpi `PaligemmaTokenizer.tokenize()`)
        state_np = state.cpu().numpy()
@@ -54,7 +54,7 @@ class PI0FastConfig(PreTrainedConfig):

    tokenizer_max_length: int = 200  # see openpi `__post_init__`
    text_tokenizer_name: str = "google/paligemma-3b-pt-224"
-    action_tokenizer_name: str = "lerobot/fast-action-tokenizer"
+    action_tokenizer_name: str = "physical-intelligence/fast"
    temperature: float = 0.0
    max_decoding_steps: int = 256
    fast_skip_tokens: int = 128
@@ -38,16 +38,11 @@ else:
 if TYPE_CHECKING or _transformers_available:
    from transformers import AutoTokenizer
    from transformers.models.auto import CONFIG_MAPPING
-
-    from lerobot.policies.pi_gemma import (
-        PaliGemmaForConditionalGenerationWithPiGemma,
-        PiGemmaModel,
-    )
+    from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
 else:
    CONFIG_MAPPING = None
+    PaliGemmaForConditionalGeneration = None
    AutoTokenizer = None
-    PiGemmaModel = None
-    PaliGemmaForConditionalGenerationWithPiGemma = None

 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig
@@ -126,7 +121,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    if images.dtype == torch.uint8:
        resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8)
    elif images.dtype == torch.float32:
-        resized_images = resized_images.clamp(0.0, 1.0)
+        resized_images = resized_images.clamp(-1.0, 1.0)
    else:
        raise ValueError(f"Unsupported image dtype: {images.dtype}")

@@ -137,7 +132,7 @@ def resize_with_pad_torch(  # see openpi `resize_with_pad_torch` (exact copy)
    pad_w1 = pad_w0 + remainder_w

    # Pad
-    constant_value = 0 if images.dtype == torch.uint8 else 0.0
+    constant_value = 0 if images.dtype == torch.uint8 else -1.0
    padded_images = F.pad(
        resized_images,
        (pad_w0, pad_w1, pad_h0, pad_h1),  # left, right, top, bottom
@@ -211,22 +206,16 @@ class PI0FastPaliGemma(nn.Module):
        vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth
        vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads
        vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh"
-        vlm_config_hf.text_config.dtype = "float32"
+        vlm_config_hf.text_config.torch_dtype = "float32"
        vlm_config_hf.text_config.vocab_size = 257152
        vlm_config_hf.text_config.use_adarms = use_adarms[0]
        vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
        vlm_config_hf.vision_config.intermediate_size = 4304
        vlm_config_hf.vision_config.projection_dim = 2048
        vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
-        vlm_config_hf.vision_config.dtype = "float32"
+        vlm_config_hf.vision_config.torch_dtype = "float32"

-        self.paligemma = PaliGemmaForConditionalGenerationWithPiGemma(config=vlm_config_hf)
-
-        # Use PI Gemma (AdaRMS) as language model when use_adarms[0] is True so that
-        # forward(..., adarms_cond=...) is supported (same as pi0/pi05).
-        if use_adarms[0]:
-            text_config = self.paligemma.config.text_config
-            self.paligemma.model.language_model = PiGemmaModel(text_config)
+        self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)

        self.to_bfloat16_for_selected_params(precision)

@@ -239,11 +228,10 @@ class PI0FastPaliGemma(nn.Module):
        else:
            raise ValueError(f"Invalid precision: {precision}")

-        # Keep full vision path in float32 so we never toggle (toggle causes optimizer
-        # "same dtype" error). Align with PI05.
        params_to_keep_float32 = [
-            "vision_tower",
-            "multi_modal_projector",
+            "vision_tower.vision_model.embeddings.patch_embedding.weight",
+            "vision_tower.vision_model.embeddings.patch_embedding.bias",
+            "vision_tower.vision_model.embeddings.position_embedding.weight",
            "input_layernorm",
            "post_attention_layernorm",
            "model.norm",
@@ -254,18 +242,10 @@ class PI0FastPaliGemma(nn.Module):
                param.data = param.data.to(dtype=torch.float32)

    def embed_image(self, image: torch.Tensor):
-        # Vision tower and multi_modal_projector are kept in float32 (params_to_keep_float32). Align with PI05.
-        out_dtype = image.dtype
-        if image.dtype != torch.float32:
-            image = image.to(torch.float32)
-        image_outputs = self.paligemma.model.get_image_features(image)
-        features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
-        if features.dtype != out_dtype:
-            features = features.to(out_dtype)
-        return features
+        return self.paligemma.model.get_image_features(image)

    def embed_language_tokens(self, tokens: torch.Tensor):
-        return self.paligemma.model.language_model.embed_tokens(tokens)
+        return self.paligemma.language_model.embed_tokens(tokens)

    def forward(
        self,
@@ -279,7 +259,7 @@ class PI0FastPaliGemma(nn.Module):
        if adarms_cond is None:
            adarms_cond = [None, None]
        if inputs_embeds[1] is None:
-            prefix_output = self.paligemma.model.language_model.forward(
+            prefix_output = self.paligemma.language_model.forward(
                inputs_embeds=inputs_embeds[0],
                attention_mask=attention_mask,
                position_ids=position_ids,
@@ -326,14 +306,24 @@ class PI0FastPytorch(nn.Module):  # see openpi `PI0Pytorch`
            self.sample_actions_fast = torch.compile(self.sample_actions_fast, mode=config.compile_mode)
            self.forward = torch.compile(self.forward, mode=config.compile_mode)

+        msg = """An incorrect transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues"""
+
+        try:
+            from transformers.models.siglip import check
+
+            if not check.check_whether_transformers_replace_is_installed_correctly():
+                raise ValueError(msg)
+        except ImportError:
+            raise ValueError(msg) from None
+
    def gradient_checkpointing_enable(self):
        """Enable gradient checkpointing for memory optimization."""
        self.gradient_checkpointing_enabled = True
        # Call the proper gradient_checkpointing_enable() method with use_reentrant=False for better memory efficiency
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing_enable(
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing_enable(
            gradient_checkpointing_kwargs={"use_reentrant": False}
        )
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing_enable(
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing_enable(
            gradient_checkpointing_kwargs={"use_reentrant": False}
        )
        logging.info("Enabled gradient checkpointing for PI0FastPytorch model")
@@ -342,8 +332,8 @@ class PI0FastPytorch(nn.Module):  # see openpi `PI0Pytorch`
        """Disable gradient checkpointing."""
        self.gradient_checkpointing_enabled = False
        # Call the proper gradient_checkpointing_disable() method
-        self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing_disable()
-        self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing_disable()
+        self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing_disable()
+        self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing_disable()
        logging.info("Disabled gradient checkpointing for PI0FastPytorch model")

    def _apply_checkpoint(self, func, *args, **kwargs):
@@ -533,7 +523,7 @@ class PI0FastPytorch(nn.Module):  # see openpi `PI0Pytorch`

        # Convert embeddings to bfloat16 if needed
        if (
-            self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
+            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
            == torch.bfloat16
        ):
            prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
@@ -626,7 +616,7 @@ class PI0FastPytorch(nn.Module):  # see openpi `PI0Pytorch`
        )

        if (
-            self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
+            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
            == torch.bfloat16
        ):
            prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
@@ -724,7 +714,7 @@ class PI0FastPytorch(nn.Module):  # see openpi `PI0Pytorch`

        # Ensure correct precision (bfloat16/float32)
        if (
-            self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
+            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
            == torch.bfloat16
        ):
            prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
@@ -907,12 +897,14 @@ class PI0FastPolicy(PreTrainedPolicy):
        # Check if dataset_stats were provided in kwargs
        model = cls(config, **kwargs)

-        # Load state dict (expects keys with "model." prefix)
+        # Now manually load and remap the state dict
        try:
+            # Try to load the pytorch_model.bin or model.safetensors file
            print(f"Loading model from: {pretrained_name_or_path}")
            try:
                from transformers.utils import cached_file

+                # Try safetensors first
                resolved_file = cached_file(
                    pretrained_name_or_path,
                    "model.safetensors",
@@ -920,7 +912,7 @@ class PI0FastPolicy(PreTrainedPolicy):
                    force_download=kwargs.get("force_download", False),
                    resume_download=kwargs.get("resume_download"),
                    proxies=kwargs.get("proxies"),
-                    token=kwargs.get("token"),
+                    use_auth_token=kwargs.get("use_auth_token"),
                    revision=kwargs.get("revision"),
                    local_files_only=kwargs.get("local_files_only", False),
                )
@@ -933,9 +925,8 @@ class PI0FastPolicy(PreTrainedPolicy):
                print("Returning model without loading pretrained weights")
                return model

-            # First, fix any key differences (see openpi model.py, _fix_pytorch_state_dict_keys)
+            # First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys`
            fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config)
-
            # Then add "model." prefix for all keys that don't already have it
            remapped_state_dict = {}
            remap_count = 0
@@ -945,6 +936,8 @@ class PI0FastPolicy(PreTrainedPolicy):
                    new_key = f"model.{key}"
                    remapped_state_dict[new_key] = value
                    remap_count += 1
+                    if remap_count <= 10:  # Only print first 10 to avoid spam
+                        print(f"Remapped: {key} -> {new_key}")
                else:
                    remapped_state_dict[key] = value

@@ -978,7 +971,7 @@ class PI0FastPolicy(PreTrainedPolicy):
                print("All keys loaded successfully!")

        except Exception as e:
-            print(f"Warning: Could not load state dict: {e}")
+            print(f"Warning: Could not remap state dict keys: {e}")

        return model

@@ -23,6 +23,7 @@ import torch

 from lerobot.configs.types import PipelineFeatureType, PolicyFeature
 from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig
+from lerobot.policies.pi0_fast.modeling_pi0_fast import pad_vector
 from lerobot.processor import (
    ActionTokenizerProcessorStep,
    AddBatchDimensionProcessorStep,
@@ -68,6 +69,9 @@ class Pi0FastPrepareStateAndLanguageTokenizerProcessorStep(ProcessorStep):
        # TODO: check if this necessary
        state = deepcopy(state)

+        # Prepare state (pad to max_state_dim)
+        state = pad_vector(state, self.max_state_dim)
+
        # State should already be normalized to [-1, 1] by the NormalizerProcessorStep that runs before this step
        # Discretize into 256 bins (see openpi `PaligemmaTokenizer.tokenize()`)
        state_np = state.cpu().numpy()
@@ -1,363 +0,0 @@
-# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import torch
-from torch import nn
-
-from lerobot.utils.import_utils import _transformers_available
-
-if TYPE_CHECKING or _transformers_available:
-    from transformers.cache_utils import DynamicCache
-    from transformers.masking_utils import create_causal_mask
-    from transformers.modeling_layers import GradientCheckpointingLayer
-    from transformers.modeling_outputs import BaseModelOutputWithPast
-    from transformers.models.gemma.modeling_gemma import (
-        GemmaAttention,
-        GemmaConfig,
-        GemmaForCausalLM,
-        GemmaMLP,
-        GemmaModel,
-    )
-    from transformers.models.paligemma.modeling_paligemma import (
-        PaliGemmaForConditionalGeneration,
-        PaliGemmaModel,
-    )
-else:
-    GemmaAttention = None
-    GemmaConfig = None
-    GemmaForCausalLM = None
-    GemmaMLP = None
-    GemmaModel = None
-    PaliGemmaModel = None
-    PaliGemmaForConditionalGeneration = None
-    DynamicCache = None
-    GradientCheckpointingLayer = None
-    BaseModelOutputWithPast = None
-    create_causal_mask = None
-
-
-def _gated_residual(
-    x: torch.Tensor | None,
-    y: torch.Tensor | None,
-    gate: torch.Tensor | None,
-) -> torch.Tensor | None:
-    """Gated residual: x + y when gate is None, else x + y * gate."""
-    if x is None and y is None:
-        return None
-    if x is None or y is None:
-        return x if x is not None else y
-    if gate is None:
-        return x + y
-    return x + y * gate
-
-
-def layernorm_forward(
-    layernorm: nn.Module,
-    x: torch.Tensor,
-    cond: torch.Tensor | None = None,
-):
-    """
-    call layernorm and return hidden states and gate
-    if cond is not None, use conditional norm
-    otherwise, use normal gemma norm
-    """
-    if cond is not None:
-        return layernorm(x, cond=cond)
-    else:
-        return layernorm(x)
-
-
-class PiGemmaRMSNorm(nn.Module):
-    """
-    Adaptive RMSNorm for PI Gemma (AdaRMS).
-    When cond_dim is set, uses cond to modulate scale/shift/gate; otherwise behaves like standard GemmaRMSNorm.
-    forward(x, cond=None) returns (output, gate) for use with _gated_residual.
-    """
-
-    def __init__(self, dim: int, eps: float = 1e-6, cond_dim: int | None = None):
-        super().__init__()
-        self.eps = eps
-        self.dim = dim
-        self.cond_dim = cond_dim
-        if cond_dim is not None:
-            self.dense = nn.Linear(cond_dim, dim * 3, bias=True)
-            nn.init.zeros_(self.dense.weight)
-        else:
-            self.weight = nn.Parameter(torch.zeros(dim))
-            self.dense = None
-
-    def _norm(self, x):
-        # Compute variance in float32 (like the source implementation)
-        var = torch.mean(torch.square(x.float()), dim=-1, keepdim=True)
-        # Compute normalization in float32
-        normed_inputs = x * torch.rsqrt(var + self.eps)
-        return normed_inputs
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        cond: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        dtype = x.dtype
-        normed = self._norm(x)
-        if cond is None or self.dense is None:
-            normed = normed * (1.0 + self.weight.float())
-            return normed.type_as(x), None
-        if cond.shape[-1] != self.cond_dim:
-            raise ValueError(f"Expected cond dim {self.cond_dim}, got {cond.shape[-1]}")
-        modulation = self.dense(cond)
-        if len(x.shape) == 3:
-            modulation = modulation.unsqueeze(1)
-        scale, shift, gate = modulation.chunk(3, dim=-1)
-        normed = normed * (1 + scale.float()) + shift.float()
-        return normed.to(dtype), gate.to(dtype)
-
-    def extra_repr(self) -> str:
-        if self.dense is not None:
-            return f"dim={self.dim}, eps={self.eps}, adaptive=True, cond_dim={self.cond_dim}"
-        return f"dim={self.dim}, eps={self.eps}"
-
-
-def _get_pi_gemma_decoder_layer_base():
-    """base for PiGemmaDecoderLayer"""
-
-    class _PiGemmaDecoderLayerBase(GradientCheckpointingLayer):
-        """Decoder layer that uses PiGemmaRMSNorm and _gated_residual, compatible with v5 Gemma."""
-
-        def __init__(self, config: GemmaConfig, layer_idx: int):
-            super().__init__()
-            self.hidden_size = config.hidden_size
-            self.self_attn = GemmaAttention(config=config, layer_idx=layer_idx)
-            self.mlp = GemmaMLP(config)
-            cond_dim = (
-                getattr(config, "adarms_cond_dim", None) if getattr(config, "use_adarms", False) else None
-            )
-            self.input_layernorm = PiGemmaRMSNorm(
-                config.hidden_size, eps=config.rms_norm_eps, cond_dim=cond_dim
-            )
-            self.post_attention_layernorm = PiGemmaRMSNorm(
-                config.hidden_size, eps=config.rms_norm_eps, cond_dim=cond_dim
-            )
-
-        def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attention_mask: torch.Tensor | None = None,
-            position_ids: torch.LongTensor | None = None,
-            past_key_values=None,
-            use_cache: bool = False,
-            cache_position: torch.LongTensor | None = None,
-            position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-            adarms_cond: torch.Tensor | None = None,
-            **kwargs,
-        ) -> torch.Tensor:
-            residual = hidden_states
-            hidden_states, gate = self.input_layernorm(hidden_states, cond=adarms_cond)
-            hidden_states, _ = self.self_attn(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=past_key_values,
-                use_cache=use_cache,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-                **kwargs,
-            )
-
-            hidden_states = _gated_residual(residual, hidden_states, gate)
-
-            residual = hidden_states
-            hidden_states, gate = self.post_attention_layernorm(hidden_states, cond=adarms_cond)
-            hidden_states = self.mlp(hidden_states)
-            hidden_states = _gated_residual(residual, hidden_states, gate)
-            return hidden_states
-
-    return _PiGemmaDecoderLayerBase
-
-
-class PiGemmaModel(GemmaModel):  # type: ignore[misc]
-    """
-    GemmaModel extended with AdaRMS (adaptive RMSNorm) and gated residuals when config.use_adarms is True.
-    """
-
-    def __init__(self, config: GemmaConfig, **kwargs):
-        super().__init__(config, **kwargs)
-        # if not getattr(config, "use_adarms", False):
-        #     return
-        cond_dim = getattr(config, "adarms_cond_dim", None)
-        pi_gemma_decoder_layer_base = _get_pi_gemma_decoder_layer_base()
-        self.layers = nn.ModuleList(
-            [pi_gemma_decoder_layer_base(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-        )
-        self.norm = PiGemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, cond_dim=cond_dim)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: DynamicCache | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        cache_position: torch.LongTensor | None = None,
-        adarms_cond: torch.Tensor | None = None,
-        **kwargs,
-    ) -> BaseModelOutputWithPast:
-        """
-        adarms_cond (`torch.Tensor` of shape `(batch_size, cond_dim)`, *optional*):
-            Condition for ADARMS.
-        """
-        output_attentions = (
-            output_attentions if output_attentions is not None else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if self.gradient_checkpointing and self.training and use_cache:
-            import logging
-
-            logging.warning(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache()
-
-        if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-            )
-
-        if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
-
-        causal_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        # embed positions
-        hidden_states = inputs_embeds
-        # Convert to bfloat16 if the first layer uses bfloat16
-        if len(self.layers) > 0 and self.layers[0].self_attn.q_proj.weight.dtype == torch.bfloat16:
-            hidden_states = hidden_states.to(torch.bfloat16)
-
-        # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
-        # normalized
-        # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
-        # See https://github.com/huggingface/transformers/pull/29402
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask,
-                position_ids=position_ids,
-                past_key_values=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-                adarms_cond=adarms_cond,
-                **kwargs,
-            )
-
-            hidden_states = layer_outputs
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states, _ = self.norm(hidden_states, adarms_cond)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-class PiGemmaForCausalLM(GemmaForCausalLM):  # type: ignore[misc]
-    """
-    Causal LM wrapper using PiGemmaModel as the backbone, for consistency with GemmaForCausalLM
-    and the language model used in pi0_fast. Use this for the action expert in pi0/pi05.
-    """
-
-    def __init__(self, config: GemmaConfig, **kwargs):
-        super().__init__(config, **kwargs)
-        self.model = PiGemmaModel(config)
-
-
-class PaliGemmaModelWithPiGemma(PaliGemmaModel):
-    """PaliGemmaModel whose language_model is PiGemmaModel (custom decoder with PiGemmaRMSNorm and gated residuals)."""
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.language_model = PiGemmaModel(config.text_config)
-
-
-class PaliGemmaForConditionalGenerationWithPiGemma(PaliGemmaForConditionalGeneration):
-    """PaliGemmaForConditionalGeneration using PiGemma decoder for the language model."""
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = PaliGemmaModelWithPiGemma(config)
-
-    # Make modules available through conditional class for BC
-    @property
-    def language_model(self):
-        return self.model.language_model
-
-
-__all__ = [
-    "PiGemmaModel",
-    "PiGemmaForCausalLM",
-    "PiGemmaRMSNorm",
-    "_gated_residual",
-    "layernorm_forward",
-    "PaliGemmaModelWithPiGemma",
-    "PaliGemmaForConditionalGenerationWithPiGemma",
-]
@@ -33,7 +33,7 @@ class RewardClassifierConfig(PreTrainedConfig):
    latent_dim: int = 256
    image_embedding_pooling_dim: int = 8
    dropout_rate: float = 0.1
-    model_name: str = "helper2424/resnet10"  # TODO: This needs to be updated. The model on the Hub doesn't call self.post_init() in its __init__, which is required by transformers v5 to set all_tied_weights_keys. The from_pretrained call fails when it tries to access this attribute during _finalize_model_loading.
+    model_name: str = "helper2424/resnet10"
    device: str = "cpu"
    model_type: str = "cnn"  # "transformer" or "cnn"
    num_cameras: int = 2
@@ -85,7 +85,7 @@ class SmolVLAConfig(PreTrainedConfig):
    scheduler_decay_lr: float = 2.5e-6

    vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"  # Select the VLM backbone.
-    load_vlm_weights: bool = False  # Set to False in case of training the expert from scratch. True when init from pretrained SmolVLA weights
+    load_vlm_weights: bool = False  # Set to True in case of training the expert from scratch. True when init from pretrained SmolVLA weights

    add_image_special_tokens: bool = False  # Whether to use special image tokens around image features.

@@ -261,15 +261,10 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration):
    and optional LoRA fine-tuning support.
    """

-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    config_class = Qwen2_5_VLConfig
    _no_split_modules = ["Qwen2_5_VLDecoderLayer_with_MoE", "Qwen2_5_VLVisionBlock"]

-    def init_weights(self):
-        if getattr(self.model, "language_model", None) is not None:
-            return
-        super().init_weights()
-
    @classmethod
    def from_pretrained(
        cls,
@@ -317,11 +312,6 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration):
            processor.action_processor = action_tokenizer
        else:
            action_tokenizer = None
-
-        # add pad_token_id to config
-        config.pad_token_id = processor.tokenizer.pad_token_id
-        config.text_config.pad_token_id = processor.tokenizer.pad_token_id
-
        # Initialize model with configuration and processor
        model = cls(config, processor=processor, action_tokenizer=action_tokenizer, **kwargs)

@@ -341,7 +331,7 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration):
                force_download=kwargs.get("force_download", False),
                resume_download=kwargs.get("resume_download"),
                proxies=kwargs.get("proxies"),
-                token=kwargs.get("token"),
+                use_auth_token=kwargs.get("use_auth_token"),
                revision=kwargs.get("revision"),
                local_files_only=kwargs.get("local_files_only", False),
            )
@@ -21,7 +21,6 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
-        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
@@ -39,7 +38,6 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size
-        self.initializer_range = initializer_range


 class Qwen2_5_VLConfig(PretrainedConfig):
@@ -11,6 +11,7 @@ from transformers.activations import ACT2FN
 from transformers.cache_utils import (
    Cache,
    DynamicCache,
+    SlidingWindowCache,
    StaticCache,
 )
 from transformers.generation import GenerationMixin
@@ -30,15 +31,6 @@ from transformers.utils import (

 from .configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig

-
-# TODO(Steven): SlidingWindowCache was removed in transformers v5. Define a placeholder so isinstance checks
-# always return False (which is the correct behavior when no sliding window cache is in use).
-class _SlidingWindowCachePlaceholder:
-    pass
-
-
-SlidingWindowCache = _SlidingWindowCachePlaceholder
-
 if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.layers.rotary import apply_rotary_emb
@@ -602,40 +594,19 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
        return hidden_states


-def _compute_default_rope_parameters_qwen2_5_vl(config, device=None):
-    """
-    compute default rope parameters for Qwen2_5_VL
-    """
-    base = config.text_config.rope_parameters["rope_theta"]
-    dim = config.hidden_size // config.num_attention_heads
-    inv_freq = 1.0 / (
-        base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
-    )
-    return inv_freq, 1.0
-
-
 class Qwen2_5_VLRotaryEmbedding(nn.Module):
    def __init__(self, config: Qwen2_5_VLConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        elif hasattr(config, "rope_parameters") and config.rope_parameters is not None:
-            self.rope_type = config.rope_parameters.get("rope_type", "default")
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
-
-        if self.rope_type == "default":
-            self.rope_init_fn = _compute_default_rope_parameters_qwen2_5_vl
-            self.rope_kwargs = {}
-        else:
-            rope_type_key = "linear" if self.rope_type == "linear" else self.rope_type
-            self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type_key]
-            self.rope_kwargs = {}
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
@@ -144,7 +144,7 @@ def preprocesser_call(
    """
    # Process image inputs
    if images is not None and len(images) > 0:
-        image_inputs = processor.image_processor(images=images, return_tensors=return_tensors)
+        image_inputs = processor.image_processor(images=images, videos=None, return_tensors=return_tensors)
        image_grid_thw = image_inputs["image_grid_thw"]
    else:
        image_inputs = {}
@@ -152,7 +152,7 @@ def preprocesser_call(

    # Process video inputs
    if videos is not None:
-        videos_inputs = processor.image_processor(videos=videos, return_tensors=return_tensors)
+        videos_inputs = processor.image_processor(images=None, videos=videos, return_tensors=return_tensors)
        video_grid_thw = videos_inputs["video_grid_thw"]
    else:
        videos_inputs = {}
@@ -13,9 +13,12 @@
 import warnings

 from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging

 """ Florence-2 configuration"""

+logger = logging.get_logger(__name__)
+

 class Florence2VisionConfig(PretrainedConfig):
    r"""
@@ -273,8 +276,6 @@ class Florence2LanguageConfig(PretrainedConfig):
        )

        # ensure backward compatibility for BART CNN models
-        if not hasattr(self, "forced_bos_token_id"):
-            self.forced_bos_token_id = None
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
@@ -46,6 +46,7 @@ from transformers.utils import (
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
+    logging,
    replace_return_docstrings,
 )

@@ -56,6 +57,8 @@ if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

+logger = logging.get_logger(__name__)
+
 _CONFIG_FOR_DOC = "Florence2Config"


@@ -989,6 +992,12 @@ class Florence2FlashAttention2(Florence2Attention):
            else:
                target_dtype = self.q_proj.weight.dtype

+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)
@@ -1126,6 +1135,11 @@ class Florence2SdpaAttention(Florence2Attention):
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "Florence2Model is using Florence2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
            return super().forward(
                hidden_states,
                key_value_states=key_value_states,
@@ -1846,6 +1860,9 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
            use_cache = False

        # decoder layers
@@ -1934,10 +1951,7 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):


 class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
-    _tied_weights_keys = {
-        "encoder.embed_tokens.weight": "shared.weight",
-        "decoder.embed_tokens.weight": "shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: Florence2LanguageConfig):
        super().__init__(config)
@@ -2062,10 +2076,7 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):

 class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
-    _tied_weights_keys = {
-        "model.encoder.embed_tokens.weight": "model.shared.weight",
-        "model.decoder.embed_tokens.weight": "model.shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

    def __init__(self, config: Florence2LanguageConfig):
@@ -2143,6 +2154,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
@@ -2423,10 +2436,11 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
    FLORENCE2_START_DOCSTRING,
 )
 class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
-    _tied_weights_keys = {
-        "language_model.model.encoder.embed_tokens.weight": "language_model.model.shared.weight",
-        "language_model.model.decoder.embed_tokens.weight": "language_model.model.shared.weight",
-    }
+    _tied_weights_keys = [
+        "language_model.encoder.embed_tokens.weight",
+        "language_model.decoder.embed_tokens.weight",
+        "language_model.lm_head.weight",
+    ]

    def __init__(self, config: Florence2Config):
        super().__init__(config)
@@ -336,7 +336,7 @@ class ActionTokenizerProcessorStep(ActionProcessorStep):
    Requires the `transformers` library to be installed.

    Attributes:
-        tokenizer_name: The name of a pretrained processor from the Hugging Face Hub (e.g., "lerobot/fast-action-tokenizer").
+        tokenizer_name: The name of a pretrained processor from the Hugging Face Hub (e.g., "physical-intelligence/fast").
        tokenizer: A pre-initialized processor/tokenizer object. If provided, `tokenizer_name` is ignored.
        trust_remote_code: Whether to trust remote code when loading the tokenizer (required for some tokenizers).
        action_tokenizer: The internal tokenizer/processor instance, loaded during initialization.
@@ -26,10 +26,8 @@ lerobot-record \
    --dataset.repo_id=<my_username>/<my_dataset_name> \
    --dataset.num_episodes=2 \
    --dataset.single_task="Grab the cube" \
-    --dataset.streaming_encoding=true \
-    --dataset.encoder_threads=2 \
    --display_data=true
-    # <- Optional: specify video codec (auto, h264, hevc, libsvtav1). Default is libsvtav1. \
+    # <- Optional: specify video codec (h264, hevc, libsvtav1). Default is libsvtav1. \
    # --dataset.vcodec=h264 \
    # <- Teleop optional if you want to teleoperate to record or in between episodes with a policy \
    # --teleop.type=so100_leader \
@@ -60,10 +58,7 @@ lerobot-record \
  --display_data=true \
  --dataset.repo_id=${HF_USER}/bimanual-so-handover-cube \
  --dataset.num_episodes=25 \
-  --dataset.single_task="Grab and handover the red cube to the other arm" \
-  --dataset.streaming_encoding=true \
-  # --dataset.vcodec=auto \
-  --dataset.encoder_threads=2
+  --dataset.single_task="Grab and handover the red cube to the other arm"
 ```
 """

@@ -184,19 +179,9 @@ class DatasetRecordConfig:
    # Number of episodes to record before batch encoding videos
    # Set to 1 for immediate encoding (default behavior), or higher for batched encoding
    video_encoding_batch_size: int = 1
-    # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto',
-    # or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'.
-    # Use 'auto' to auto-detect the best available hardware encoder.
+    # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1'.
+    # Use 'h264' for faster encoding on systems where AV1 encoding is CPU-heavy.
    vcodec: str = "libsvtav1"
-    # Enable streaming video encoding: encode frames in real-time during capture instead
-    # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
-    streaming_encoding: bool = False
-    # Maximum number of frames to buffer per camera when using streaming encoding.
-    # ~1s buffer at 30fps. Provides backpressure if the encoder can't keep up.
-    encoder_queue_maxsize: int = 30
-    # Number of threads per encoder instance. None = auto (codec default).
-    # Lower values reduce CPU usage, maps to 'lp' (via svtav1-params) for libsvtav1 and 'threads' for h264/hevc..
-    encoder_threads: int | None = None
    # Rename map for the observation to override the image and state keys
    rename_map: dict[str, str] = field(default_factory=dict)

@@ -467,9 +452,6 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
                root=cfg.dataset.root,
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                vcodec=cfg.dataset.vcodec,
-                streaming_encoding=cfg.dataset.streaming_encoding,
-                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
-                encoder_threads=cfg.dataset.encoder_threads,
            )

            if hasattr(robot, "cameras") and len(robot.cameras) > 0:
@@ -492,9 +474,6 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
                image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                vcodec=cfg.dataset.vcodec,
-                streaming_encoding=cfg.dataset.streaming_encoding,
-                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
-                encoder_threads=cfg.dataset.encoder_threads,
            )

        # Load pretrained policy
@@ -518,11 +497,6 @@ def record(cfg: RecordConfig) -> LeRobotDataset:

        listener, events = init_keyboard_listener()

-        if not cfg.dataset.streaming_encoding:
-            logging.info(
-                "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
-            )
-
        with VideoEncodingManager(dataset):
            recorded_episodes = 0
            while recorded_episodes < cfg.dataset.num_episodes and not events["stop_recording"]:
@@ -152,7 +152,6 @@ def test_motor(bus, motor_id: int, timeout: float, use_fd: bool):
    )
    try:
        bus.send(disable_msg)
-        bus.recv(timeout=0.1)  # Clear any pending responses
    except Exception:
        print(f"Error sending message to motor 0x{motor_id:02X}")

@@ -306,7 +306,7 @@ def train_fast_tokenizer(

    # download the tokenizer source code (not pretrained weights)
    # we'll train a new tokenizer on our own data
-    base_tokenizer = AutoProcessor.from_pretrained("lerobot/fast-action-tokenizer", trust_remote_code=True)
+    base_tokenizer = AutoProcessor.from_pretrained("physical-intelligence/fast", trust_remote_code=True)

    # convert action_chunks array to list of arrays (expected by .fit())
    action_data_list = [action_chunks[i] for i in range(len(action_chunks))]
@@ -31,6 +31,7 @@ from lerobot.configs.train import TrainPipelineConfig
 from lerobot.datasets.factory import make_dataset
 from lerobot.datasets.image_writer import image_array_to_pil_image
 from lerobot.datasets.lerobot_dataset import (
+    VALID_VIDEO_CODECS,
    LeRobotDataset,
    MultiLeRobotDataset,
    _encode_video_worker,
@@ -44,7 +45,6 @@ from lerobot.datasets.utils import (
    hf_transform_to_torch,
    hw_to_dataset_features,
 )
-from lerobot.datasets.video_utils import VALID_VIDEO_CODECS
 from lerobot.envs.factory import make_env_config
 from lerobot.policies.factory import make_policy_config
 from lerobot.robots import make_robot_from_config
@@ -393,7 +393,7 @@ def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory):
        vid_key: {"dtype": "video", "shape": DUMMY_HWC, "names": ["height", "width", "channels"]},
    }
    ds_mixed = empty_lerobot_dataset_factory(
-        root=tmp_path / "mixed", features=features_mixed, batch_encoding_size=2, streaming_encoding=False
+        root=tmp_path / "mixed", features=features_mixed, batch_encoding_size=2
    )
    ds_mixed.add_frame(
        {
@@ -1450,10 +1450,7 @@ def test_valid_video_codecs_constant():
    assert "h264" in VALID_VIDEO_CODECS
    assert "hevc" in VALID_VIDEO_CODECS
    assert "libsvtav1" in VALID_VIDEO_CODECS
-    assert "auto" in VALID_VIDEO_CODECS
-    assert "h264_videotoolbox" in VALID_VIDEO_CODECS
-    assert "h264_nvenc" in VALID_VIDEO_CODECS
-    assert len(VALID_VIDEO_CODECS) == 10
+    assert len(VALID_VIDEO_CODECS) == 3


 def test_delta_timestamps_with_episodes_filter(tmp_path, empty_lerobot_dataset_factory):
@@ -1,730 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for streaming video encoding and hardware-accelerated encoding."""
-
-import queue
-import threading
-from unittest.mock import patch
-
-import av
-import numpy as np
-import pytest
-
-from lerobot.datasets.video_utils import (
-    VALID_VIDEO_CODECS,
-    StreamingVideoEncoder,
-    _CameraEncoderThread,
-    _get_codec_options,
-    detect_available_hw_encoders,
-    resolve_vcodec,
-)
-from lerobot.utils.constants import OBS_IMAGES
-
-# ─── _get_codec_options tests ───
-
-
-class TestGetCodecOptions:
-    def test_libsvtav1_defaults(self):
-        opts = _get_codec_options("libsvtav1")
-        assert opts["g"] == "2"
-        assert opts["crf"] == "30"
-        assert opts["preset"] == "12"
-
-    def test_libsvtav1_custom_preset(self):
-        opts = _get_codec_options("libsvtav1", preset=8)
-        assert opts["preset"] == "8"
-
-    def test_h264_options(self):
-        opts = _get_codec_options("h264", g=10, crf=23)
-        assert opts["g"] == "10"
-        assert opts["crf"] == "23"
-        assert "preset" not in opts
-
-    def test_videotoolbox_options(self):
-        opts = _get_codec_options("h264_videotoolbox", g=2, crf=30)
-        assert opts["g"] == "2"
-        # CRF 30 maps to quality = max(1, min(100, 100 - 30*2)) = 40
-        assert opts["q:v"] == "40"
-        assert "crf" not in opts
-
-    def test_nvenc_options(self):
-        opts = _get_codec_options("h264_nvenc", g=2, crf=25)
-        assert opts["rc"] == "constqp"
-        assert opts["qp"] == "25"
-        assert "crf" not in opts
-        # NVENC doesn't support g
-        assert "g" not in opts
-
-    def test_vaapi_options(self):
-        opts = _get_codec_options("h264_vaapi", crf=28)
-        assert opts["qp"] == "28"
-
-    def test_qsv_options(self):
-        opts = _get_codec_options("h264_qsv", crf=25)
-        assert opts["global_quality"] == "25"
-
-    def test_no_g_no_crf(self):
-        opts = _get_codec_options("h264", g=None, crf=None)
-        assert "g" not in opts
-        assert "crf" not in opts
-
-
-# ─── HW encoder detection tests ───
-
-
-class TestHWEncoderDetection:
-    def test_detect_available_hw_encoders_returns_list(self):
-        result = detect_available_hw_encoders()
-        assert isinstance(result, list)
-
-    def test_detect_available_hw_encoders_only_valid(self):
-        from lerobot.datasets.video_utils import HW_ENCODERS
-
-        result = detect_available_hw_encoders()
-        for encoder in result:
-            assert encoder in HW_ENCODERS
-
-    def test_resolve_vcodec_passthrough(self):
-        assert resolve_vcodec("libsvtav1") == "libsvtav1"
-        assert resolve_vcodec("h264") == "h264"
-
-    def test_resolve_vcodec_auto_fallback(self):
-        """When no HW encoders are available, auto should fall back to libsvtav1."""
-        with patch("lerobot.datasets.video_utils.detect_available_hw_encoders", return_value=[]):
-            assert resolve_vcodec("auto") == "libsvtav1"
-
-    def test_resolve_vcodec_auto_picks_hw(self):
-        """When a HW encoder is available, auto should pick it."""
-        with patch(
-            "lerobot.datasets.video_utils.detect_available_hw_encoders",
-            return_value=["h264_videotoolbox"],
-        ):
-            assert resolve_vcodec("auto") == "h264_videotoolbox"
-
-    def test_resolve_vcodec_auto_returns_valid(self):
-        """Test that resolve_vcodec('auto') returns a known valid codec."""
-        result = resolve_vcodec("auto")
-        assert result in VALID_VIDEO_CODECS
-
-    def test_hw_encoder_names_accepted_in_validation(self):
-        """Test that HW encoder names pass validation in VALID_VIDEO_CODECS."""
-        assert "auto" in VALID_VIDEO_CODECS
-        assert "h264_videotoolbox" in VALID_VIDEO_CODECS
-        assert "h264_nvenc" in VALID_VIDEO_CODECS
-
-    def test_resolve_vcodec_invalid_raises(self):
-        """Test that resolve_vcodec raises ValueError for invalid codecs."""
-        with pytest.raises(ValueError, match="Invalid vcodec"):
-            resolve_vcodec("not_a_real_codec")
-
-
-# ─── _CameraEncoderThread tests ───
-
-
-class TestCameraEncoderThread:
-    def test_encodes_valid_mp4(self, tmp_path):
-        """Test that the encoder thread creates a valid MP4 file with correct frame count."""
-        num_frames = 30
-        height, width = 64, 96
-        fps = 30
-        video_path = tmp_path / "test_output" / "test.mp4"
-
-        frame_queue: queue.Queue = queue.Queue(maxsize=60)
-        result_queue: queue.Queue = queue.Queue(maxsize=1)
-        stop_event = threading.Event()
-
-        encoder_thread = _CameraEncoderThread(
-            video_path=video_path,
-            fps=fps,
-            vcodec="libsvtav1",
-            pix_fmt="yuv420p",
-            g=2,
-            crf=30,
-            preset=13,
-            frame_queue=frame_queue,
-            result_queue=result_queue,
-            stop_event=stop_event,
-        )
-        encoder_thread.start()
-
-        # Feed frames (HWC uint8)
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
-            frame_queue.put(frame)
-
-        # Send sentinel
-        frame_queue.put(None)
-        encoder_thread.join(timeout=60)
-        assert not encoder_thread.is_alive()
-
-        # Check result
-        status, data = result_queue.get(timeout=5)
-        assert status == "ok"
-        assert data is not None  # Stats should be returned
-        assert "mean" in data
-        assert "std" in data
-        assert "min" in data
-        assert "max" in data
-        assert "count" in data
-
-        # Verify the MP4 file is valid
-        assert video_path.exists()
-        with av.open(str(video_path)) as container:
-            stream = container.streams.video[0]
-            # The frame count should match
-            total_frames = sum(1 for _ in container.decode(stream))
-        assert total_frames == num_frames
-
-    def test_handles_chw_input(self, tmp_path):
-        """Test that CHW format input is handled correctly."""
-        num_frames = 5
-        fps = 30
-        video_path = tmp_path / "test_chw" / "test.mp4"
-
-        frame_queue: queue.Queue = queue.Queue(maxsize=60)
-        result_queue: queue.Queue = queue.Queue(maxsize=1)
-        stop_event = threading.Event()
-
-        encoder_thread = _CameraEncoderThread(
-            video_path=video_path,
-            fps=fps,
-            vcodec="libsvtav1",
-            pix_fmt="yuv420p",
-            g=2,
-            crf=30,
-            preset=13,
-            frame_queue=frame_queue,
-            result_queue=result_queue,
-            stop_event=stop_event,
-        )
-        encoder_thread.start()
-
-        # Feed CHW frames
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (3, 64, 96), dtype=np.uint8)
-            frame_queue.put(frame)
-
-        frame_queue.put(None)
-        encoder_thread.join(timeout=60)
-
-        status, _ = result_queue.get(timeout=5)
-        assert status == "ok"
-        assert video_path.exists()
-
-    def test_stop_event_cancellation(self, tmp_path):
-        """Test that setting the stop event causes the thread to exit."""
-        fps = 30
-        video_path = tmp_path / "test_cancel" / "test.mp4"
-
-        frame_queue: queue.Queue = queue.Queue(maxsize=60)
-        result_queue: queue.Queue = queue.Queue(maxsize=1)
-        stop_event = threading.Event()
-
-        encoder_thread = _CameraEncoderThread(
-            video_path=video_path,
-            fps=fps,
-            vcodec="libsvtav1",
-            pix_fmt="yuv420p",
-            g=2,
-            crf=30,
-            preset=13,
-            frame_queue=frame_queue,
-            result_queue=result_queue,
-            stop_event=stop_event,
-        )
-        encoder_thread.start()
-
-        # Feed a few frames
-        for _ in range(3):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            frame_queue.put(frame)
-
-        # Signal stop instead of sending sentinel
-        stop_event.set()
-        encoder_thread.join(timeout=10)
-        assert not encoder_thread.is_alive()
-
-
-# ─── StreamingVideoEncoder tests ───
-
-
-class TestStreamingVideoEncoder:
-    def test_single_camera_episode(self, tmp_path):
-        """Test encoding a single camera episode."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
-
-        video_keys = [f"{OBS_IMAGES}.laptop"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        num_frames = 20
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.laptop", frame)
-
-        results = encoder.finish_episode()
-        assert f"{OBS_IMAGES}.laptop" in results
-
-        mp4_path, stats = results[f"{OBS_IMAGES}.laptop"]
-        assert mp4_path.exists()
-        assert stats is not None
-
-        # Verify frame count
-        with av.open(str(mp4_path)) as container:
-            stream = container.streams.video[0]
-            total_frames = sum(1 for _ in container.decode(stream))
-        assert total_frames == num_frames
-
-        encoder.close()
-
-    def test_multi_camera_episode(self, tmp_path):
-        """Test encoding multiple cameras simultaneously."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-
-        video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        num_frames = 15
-        for _ in range(num_frames):
-            frame0 = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            frame1 = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(video_keys[0], frame0)
-            encoder.feed_frame(video_keys[1], frame1)
-
-        results = encoder.finish_episode()
-
-        for key in video_keys:
-            assert key in results
-            mp4_path, stats = results[key]
-            assert mp4_path.exists()
-            assert stats is not None
-
-        encoder.close()
-
-    def test_sequential_episodes(self, tmp_path):
-        """Test that multiple sequential episodes work correctly."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-        video_keys = [f"{OBS_IMAGES}.cam"]
-
-        for ep in range(3):
-            encoder.start_episode(video_keys, tmp_path)
-            num_frames = 10 + ep * 5
-            for _ in range(num_frames):
-                frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-                encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-            results = encoder.finish_episode()
-
-            mp4_path, stats = results[f"{OBS_IMAGES}.cam"]
-            assert mp4_path.exists()
-
-            with av.open(str(mp4_path)) as container:
-                stream = container.streams.video[0]
-                total_frames = sum(1 for _ in container.decode(stream))
-            assert total_frames == num_frames
-
-        encoder.close()
-
-    def test_cancel_episode(self, tmp_path):
-        """Test that canceling an episode cleans up properly."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-        video_keys = [f"{OBS_IMAGES}.cam"]
-
-        encoder.start_episode(video_keys, tmp_path)
-
-        for _ in range(5):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-
-        encoder.cancel_episode()
-
-        # Should be able to start a new episode after cancel
-        encoder.start_episode(video_keys, tmp_path)
-        for _ in range(5):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-        results = encoder.finish_episode()
-
-        assert f"{OBS_IMAGES}.cam" in results
-        encoder.close()
-
-    def test_feed_without_start_raises(self, tmp_path):
-        """Test that feeding frames without starting an episode raises."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
-        with pytest.raises(RuntimeError, match="No active episode"):
-            encoder.feed_frame("cam", np.zeros((64, 96, 3), dtype=np.uint8))
-        encoder.close()
-
-    def test_finish_without_start_raises(self, tmp_path):
-        """Test that finishing without starting raises."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
-        with pytest.raises(RuntimeError, match="No active episode"):
-            encoder.finish_episode()
-        encoder.close()
-
-    def test_close_is_idempotent(self, tmp_path):
-        """Test that close() can be called multiple times safely."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
-        encoder.close()
-        encoder.close()  # Should not raise
-
-    def test_video_duration_matches_frame_count(self, tmp_path):
-        """Test that encoded video duration matches num_frames / fps."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
-        video_keys = [f"{OBS_IMAGES}.cam"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        num_frames = 90  # 3 seconds at 30fps
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-
-        results = encoder.finish_episode()
-        mp4_path, _ = results[f"{OBS_IMAGES}.cam"]
-
-        expected_duration = num_frames / 30.0  # 3.0 seconds
-
-        with av.open(str(mp4_path)) as container:
-            stream = container.streams.video[0]
-            total_frames = sum(1 for _ in container.decode(stream))
-            if stream.duration is not None:
-                actual_duration = float(stream.duration * stream.time_base)
-            else:
-                actual_duration = float(container.duration / av.time_base)
-
-        assert total_frames == num_frames
-        # Allow small tolerance for duration due to codec framing
-        assert abs(actual_duration - expected_duration) < 0.5, (
-            f"Video duration {actual_duration:.2f}s != expected {expected_duration:.2f}s"
-        )
-
-        encoder.close()
-
-    def test_multi_camera_start_episode_called_once(self, tmp_path):
-        """Test that with multiple cameras, no frames are lost due to double start_episode."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-
-        video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        num_frames = 30
-        for _ in range(num_frames):
-            frame0 = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            frame1 = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(video_keys[0], frame0)
-            encoder.feed_frame(video_keys[1], frame1)
-
-        results = encoder.finish_episode()
-
-        # Both cameras should have all frames
-        for key in video_keys:
-            mp4_path, stats = results[key]
-            assert mp4_path.exists()
-            with av.open(str(mp4_path)) as container:
-                stream = container.streams.video[0]
-                total_frames = sum(1 for _ in container.decode(stream))
-            assert total_frames == num_frames, (
-                f"Camera {key}: expected {num_frames} frames, got {total_frames}"
-            )
-
-        encoder.close()
-
-    def test_encoder_threads_passed_to_thread(self, tmp_path):
-        """Test that encoder_threads is stored and passed through to encoder threads."""
-        encoder = StreamingVideoEncoder(
-            fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, encoder_threads=2
-        )
-        assert encoder.encoder_threads == 2
-
-        video_keys = [f"{OBS_IMAGES}.cam"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        # Verify the thread received the encoder_threads value
-        thread = encoder._threads[f"{OBS_IMAGES}.cam"]
-        assert thread.encoder_threads == 2
-
-        # Feed some frames and finish to ensure it works end-to-end
-        num_frames = 10
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-
-        results = encoder.finish_episode()
-        mp4_path, stats = results[f"{OBS_IMAGES}.cam"]
-        assert mp4_path.exists()
-        assert stats is not None
-
-        with av.open(str(mp4_path)) as container:
-            stream = container.streams.video[0]
-            total_frames = sum(1 for _ in container.decode(stream))
-        assert total_frames == num_frames
-
-        encoder.close()
-
-    def test_encoder_threads_none_by_default(self, tmp_path):
-        """Test that encoder_threads defaults to None (codec auto-detect)."""
-        encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
-        assert encoder.encoder_threads is None
-        encoder.close()
-
-    def test_graceful_frame_dropping(self, tmp_path):
-        """Test that full queue drops frames instead of crashing."""
-        encoder = StreamingVideoEncoder(
-            fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13, queue_maxsize=1
-        )
-        video_keys = [f"{OBS_IMAGES}.cam"]
-        encoder.start_episode(video_keys, tmp_path)
-
-        # Feed many frames quickly - with queue_maxsize=1, some will be dropped
-        num_frames = 50
-        for _ in range(num_frames):
-            frame = np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8)
-            encoder.feed_frame(f"{OBS_IMAGES}.cam", frame)
-
-        # Should not raise - frames are dropped gracefully
-        results = encoder.finish_episode()
-        assert f"{OBS_IMAGES}.cam" in results
-
-        mp4_path, _ = results[f"{OBS_IMAGES}.cam"]
-        assert mp4_path.exists()
-
-        # Some frames should have been dropped (queue was tiny)
-        dropped = encoder._dropped_frames.get(f"{OBS_IMAGES}.cam", 0)
-        # We can't guarantee drops but can verify no crash occurred
-        assert dropped >= 0
-
-        encoder.close()
-
-
-# ─── Integration tests with LeRobotDataset ───
-
-
-class TestStreamingEncoderIntegration:
-    def test_add_frame_save_episode_streaming(self, tmp_path):
-        """Full integration test: add_frame -> save_episode with streaming encoding."""
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        features = {
-            "observation.images.cam": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "action": {"dtype": "float32", "shape": (6,), "names": ["j1", "j2", "j3", "j4", "j5", "j6"]},
-        }
-
-        dataset = LeRobotDataset.create(
-            repo_id="test/streaming",
-            fps=30,
-            features=features,
-            root=tmp_path / "streaming_test",
-            use_videos=True,
-            streaming_encoding=True,
-        )
-
-        assert dataset._streaming_encoder is not None
-
-        num_frames = 20
-        for _ in range(num_frames):
-            frame = {
-                "observation.images.cam": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "action": np.random.randn(6).astype(np.float32),
-                "task": "test task",
-            }
-            dataset.add_frame(frame)
-
-        dataset.save_episode()
-
-        # Verify dataset metadata
-        assert dataset.meta.total_episodes == 1
-        assert dataset.meta.total_frames == num_frames
-
-        # Verify stats exist for the video key
-        assert dataset.meta.stats is not None
-        assert "observation.images.cam" in dataset.meta.stats
-        assert "action" in dataset.meta.stats
-
-        dataset.finalize()
-
-    def test_streaming_disabled_creates_pngs(self, tmp_path):
-        """Test that disabling streaming encoding falls back to PNG path."""
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        features = {
-            "observation.images.cam": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "action": {"dtype": "float32", "shape": (6,), "names": ["j1", "j2", "j3", "j4", "j5", "j6"]},
-        }
-
-        dataset = LeRobotDataset.create(
-            repo_id="test/no_streaming",
-            fps=30,
-            features=features,
-            root=tmp_path / "no_streaming_test",
-            use_videos=True,
-            streaming_encoding=False,
-        )
-
-        assert dataset._streaming_encoder is None
-
-        num_frames = 5
-        for _ in range(num_frames):
-            frame = {
-                "observation.images.cam": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "action": np.random.randn(6).astype(np.float32),
-                "task": "test task",
-            }
-            dataset.add_frame(frame)
-
-        # With streaming disabled, PNG files should be written
-        images_dir = dataset.root / "images"
-        assert images_dir.exists()
-
-        dataset.save_episode()
-        dataset.finalize()
-
-    def test_multi_episode_streaming(self, tmp_path):
-        """Test recording multiple episodes with streaming encoding."""
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        features = {
-            "observation.images.cam": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "action": {"dtype": "float32", "shape": (2,), "names": ["j1", "j2"]},
-        }
-
-        dataset = LeRobotDataset.create(
-            repo_id="test/multi_ep",
-            fps=30,
-            features=features,
-            root=tmp_path / "multi_ep_test",
-            use_videos=True,
-            streaming_encoding=True,
-        )
-
-        for ep in range(3):
-            num_frames = 10 + ep * 5
-            for _ in range(num_frames):
-                frame = {
-                    "observation.images.cam": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                    "action": np.random.randn(2).astype(np.float32),
-                    "task": f"task_{ep}",
-                }
-                dataset.add_frame(frame)
-            dataset.save_episode()
-
-        assert dataset.meta.total_episodes == 3
-        assert dataset.meta.total_frames == 10 + 15 + 20
-
-        dataset.finalize()
-
-    def test_clear_episode_buffer_cancels_streaming(self, tmp_path):
-        """Test that clearing episode buffer cancels streaming encoding."""
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        features = {
-            "observation.images.cam": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "action": {"dtype": "float32", "shape": (2,), "names": ["j1", "j2"]},
-        }
-
-        dataset = LeRobotDataset.create(
-            repo_id="test/cancel",
-            fps=30,
-            features=features,
-            root=tmp_path / "cancel_test",
-            use_videos=True,
-            streaming_encoding=True,
-        )
-
-        # Add some frames
-        for _ in range(5):
-            frame = {
-                "observation.images.cam": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "action": np.random.randn(2).astype(np.float32),
-                "task": "task",
-            }
-            dataset.add_frame(frame)
-
-        # Cancel and re-record
-        dataset.clear_episode_buffer()
-
-        # Record a new episode
-        for _ in range(10):
-            frame = {
-                "observation.images.cam": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "action": np.random.randn(2).astype(np.float32),
-                "task": "task",
-            }
-            dataset.add_frame(frame)
-        dataset.save_episode()
-
-        assert dataset.meta.total_episodes == 1
-        assert dataset.meta.total_frames == 10
-
-        dataset.finalize()
-
-    def test_multi_camera_add_frame_streaming(self, tmp_path):
-        """Test that start_episode is called once with multiple video keys."""
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        features = {
-            "observation.images.cam1": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "observation.images.cam2": {
-                "dtype": "video",
-                "shape": (64, 96, 3),
-                "names": ["height", "width", "channels"],
-            },
-            "action": {"dtype": "float32", "shape": (2,), "names": ["j1", "j2"]},
-        }
-
-        dataset = LeRobotDataset.create(
-            repo_id="test/multi_cam",
-            fps=30,
-            features=features,
-            root=tmp_path / "multi_cam_test",
-            use_videos=True,
-            streaming_encoding=True,
-        )
-
-        num_frames = 15
-        for _ in range(num_frames):
-            frame = {
-                "observation.images.cam1": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "observation.images.cam2": np.random.randint(0, 255, (64, 96, 3), dtype=np.uint8),
-                "action": np.random.randn(2).astype(np.float32),
-                "task": "test task",
-            }
-            dataset.add_frame(frame)
-
-        dataset.save_episode()
-
-        assert dataset.meta.total_episodes == 1
-        assert dataset.meta.total_frames == num_frames
-
-        dataset.finalize()
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pytest
 import torch

 from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
@@ -38,9 +37,6 @@ def test_classifier_output():


@require_package("transformers")
-@pytest.mark.skip(
-    reason="helper2424/resnet10 needs to be updated to work with the latest version of transformers"
-)
 def test_binary_classifier_with_default_params():
    from lerobot.policies.sac.reward_model.modeling_classifier import Classifier

@@ -82,9 +78,6 @@ def test_binary_classifier_with_default_params():


@require_package("transformers")
-@pytest.mark.skip(
-    reason="helper2424/resnet10 needs to be updated to work with the latest version of transformers"
-)
 def test_multiclass_classifier():
    from lerobot.policies.sac.reward_model.modeling_classifier import Classifier

@@ -124,9 +117,6 @@ def test_multiclass_classifier():


@require_package("transformers")
-@pytest.mark.skip(
-    reason="helper2424/resnet10 needs to be updated to work with the latest version of transformers"
-)
 def test_default_device():
    from lerobot.policies.sac.reward_model.modeling_classifier import Classifier

@@ -139,9 +129,6 @@ def test_default_device():


@require_package("transformers")
-@pytest.mark.skip(
-    reason="helper2424/resnet10 needs to be updated to work with the latest version of transformers"
-)
 def test_explicit_device_setup():
    from lerobot.policies.sac.reward_model.modeling_classifier import Classifier

@@ -17,6 +17,7 @@
 """Test script to verify PI0Fast policy integration with LeRobot vs the original implementation"""
 # ruff: noqa: E402

+import os
 import random
 from copy import deepcopy
 from typing import Any
@@ -27,6 +28,10 @@ import torch

 pytest.importorskip("transformers")
 pytest.importorskip("scipy")
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires accepting the model license",
+)

 from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig
 from lerobot.policies.pi0_fast.modeling_pi0_fast import PI0FastPolicy
@@ -49,19 +54,19 @@ IMAGE_HEIGHT = 224
 IMAGE_WIDTH = 224
 NUM_VIEWS = 2  # Number of camera views
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_PATH_LEROBOT = "jadechoghari/pi0fast-base"
+MODEL_PATH_LEROBOT = "lerobot/pi0fast-base"

 # Expected action token shape: (batch_size, max_decoding_steps)
 EXPECTED_ACTION_TOKENS_SHAPE = (1, 2)

 # Expected first 5 action tokens (for reproducibility check)
-EXPECTED_ACTION_TOKENS_FIRST_5 = torch.tensor([255657, 255425])
+EXPECTED_ACTION_TOKENS_FIRST_5 = torch.tensor([255657, 255362])

 # Expected actions after detokenization
 EXPECTED_ACTIONS_SHAPE = (1, 2, 32)  # (batch_size, n_action_steps, action_dim)
-EXPECTED_ACTIONS_MEAN = 0.046403881162405014
-EXPECTED_ACTIONS_STD = 0.2607129216194153
-EXPECTED_ACTIONS_FIRST_5 = torch.tensor([-0.0707, 1.4849, 0.0000, 0.0000, 0.0000])
+EXPECTED_ACTIONS_MEAN = 0.04419417306780815
+EXPECTED_ACTIONS_STD = 0.26231569051742554
+EXPECTED_ACTIONS_FIRST_5 = torch.tensor([0.0000, 1.4849, 0.0000, 0.0000, 0.0000])


 def set_seed_all(seed: int):
@@ -16,8 +16,17 @@

 """Test script to verify PI0 policy integration with LeRobot, only meant to be run locally!"""

+import os
+
+import pytest
 import torch

+# Skip this entire module in CI
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires local OpenPI installation and is not meant for CI",
+)
+
 from lerobot.policies.factory import make_policy_config  # noqa: E402
 from lerobot.policies.pi0 import (  # noqa: E402
    PI0Config,
@@ -16,15 +16,25 @@

 """Test script to verify PI0.5 (pi05) support in PI0 policy, only meant to be run locally!"""

+import os
+
+import pytest
 import torch

+from lerobot.utils.random_utils import set_seed
+
+# Skip this entire module in CI
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires local OpenPI installation and is not meant for CI",
+)
+
 from lerobot.policies.factory import make_policy_config  # noqa: E402
 from lerobot.policies.pi05 import (  # noqa: E402
    PI05Config,
    PI05Policy,
    make_pi05_pre_post_processors,  # noqa: E402
 )
-from lerobot.utils.random_utils import set_seed
 from tests.utils import require_cuda  # noqa: E402


@@ -24,10 +24,9 @@ import torch
 # Skip this entire module in CI
 pytestmark = pytest.mark.skipif(
    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
-    reason="TODO: This test seems to hang the CI",
+    reason="This test requires local OpenPI installation and is not meant for CI",
 )

-
 from lerobot.configs.types import FeatureType, PolicyFeature, RTCAttentionSchedule  # noqa: E402
 from lerobot.policies.pi05 import PI05Config, PI05Policy, make_pi05_pre_post_processors  # noqa: E402
 from lerobot.policies.rtc.configuration_rtc import RTCConfig  # noqa: E402
@@ -24,10 +24,9 @@ import torch
 # Skip this entire module in CI
 pytestmark = pytest.mark.skipif(
    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
-    reason="TODO: This test seems to hang the CI",
+    reason="This test requires local OpenPI installation and is not meant for CI",
 )

-
 from lerobot.configs.types import FeatureType, PolicyFeature, RTCAttentionSchedule  # noqa: E402
 from lerobot.policies.pi0 import PI0Config, PI0Policy, make_pi0_pre_post_processors  # noqa: E402
 from lerobot.policies.rtc.configuration_rtc import RTCConfig  # noqa: E402
@@ -305,9 +305,6 @@ def test_sac_policy_with_visual_input(batch_size: int, state_dim: int, action_di
    [(1, 6, 6, "helper2424/resnet10"), (1, 6, 6, "facebook/convnext-base-224")],
 )
@pytest.mark.skipif(not TRANSFORMERS_AVAILABLE, reason="Transformers are not installed")
-@pytest.mark.skip(
-    reason="helper2424/resnet10 needs to be updated to work with the latest version of transformers"
-)
 def test_sac_policy_with_pretrained_encoder(
    batch_size: int, state_dim: int, action_dim: int, vision_encoder_name: str
 ):
@@ -16,6 +16,8 @@

 """Test script to verify Wall-X policy integration with LeRobot, only meant to be run locally!"""

+import os
+
 import pytest
 import torch

@@ -24,6 +26,12 @@ pytest.importorskip("peft")
 pytest.importorskip("transformers")
 pytest.importorskip("torchdiffeq")

+# Skip this entire module in CI
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires local Wall-X installation and is not meant for CI",
+)
+
 from lerobot.policies.factory import make_policy_config  # noqa: E402
 from lerobot.policies.wall_x import WallXConfig  # noqa: E402
 from lerobot.policies.wall_x.modeling_wall_x import WallXPolicy  # noqa: E402