From 788dde3a34722f0bb4cb3f8e7da63930eb3bffd7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Jul 2025 13:56:44 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 4 ++-- docs/source/porting_datasets_v3.mdx | 18 ++++++++++++++++-- src/lerobot/datasets/lerobot_dataset.py | 2 +- src/lerobot/datasets/sampler.py | 5 +++-- src/lerobot/datasets/video_utils.py | 4 ++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c38426d4f..ac55d6ecf 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ Under the hood, the `LeRobotDataset` format makes use of several ways to seriali Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects: -``` +```` dataset attributes: ├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example: │ ├ observation.images.cam_high (VideoFrame): @@ -278,7 +278,7 @@ python -m lerobot.scripts.eval \ --eval.n_episodes=10 \ --policy.use_amp=false \ --policy.device=cuda -``` +```` Note: After training your own policy, you can re-evaluate the checkpoints with: diff --git a/docs/source/porting_datasets_v3.mdx b/docs/source/porting_datasets_v3.mdx index 593de0bae..36e5f93cb 100644 --- a/docs/source/porting_datasets_v3.mdx +++ b/docs/source/porting_datasets_v3.mdx @@ -7,6 +7,7 @@ This tutorial explains how to port large-scale robotic datasets to the LeRobot D Dataset v3.0 fundamentally changes how data is organized and stored: **v2.1 Structure (Episode-based)**: + ``` dataset/ ├── data/chunk-000/episode_000000.parquet @@ -16,6 +17,7 @@ dataset/ ``` **v3.0 Structure (File-based)**: + ``` dataset/ ├── data/chunk-000/file-000.parquet # Multiple episodes per file @@ -30,16 +32,19 @@ This transition from individual episode files to file-based chunks dramatically Dataset v3.0 introduces significant improvements for handling large datasets: ### 🏗️ **Enhanced File Organization** + - **File-based structure**: Episodes are now grouped into chunked files rather than individual episode files - **Configurable file sizes**: for data and video files - **Improved storage efficiency**: Better compression and reduced overhead ### 📊 **Modern Metadata Management** + - **Parquet-based metadata**: Replaced JSON Lines with efficient parquet format - **Structured episode access**: Direct pandas DataFrame access via `dataset.meta.episodes` - **Per-episode statistics**: Enhanced statistics tracking at episode level ### 🚀 **Performance Enhancements** + - **Memory-mapped access**: Improved RAM usage through PyArrow memory mapping - **Faster loading**: Significantly reduced dataset initialization time - **Better scalability**: Designed for datasets with millions of episodes @@ -56,6 +61,7 @@ Before porting large datasets, ensure you have: ## Understanding the DROID Dataset [DROID 1.0.1](https://droid-dataset.github.io/droid/the-droid-dataset) is an excellent example of a large-scale robotic dataset: + - **Size**: 1.7TB (RLDS format), 8.7TB (raw data) - **Structure**: 2048 pre-defined TensorFlow dataset shards - **Content**: 76,000+ robot manipulation trajectories from Franka Emika Panda robots @@ -64,6 +70,7 @@ Before porting large datasets, ensure you have: - **Hosting**: Google Cloud 
Storage with public access via `gsutil` The dataset contains diverse manipulation demonstrations with: + - Multiple camera views (wrist camera, exterior cameras) - Natural language task descriptions - Robot proprioceptive state and actions @@ -109,6 +116,7 @@ DROID_FEATURES = { ### Step 1: Install Dependencies For DROID specifically: + ```bash pip install tensorflow pip install tensorflow_datasets @@ -133,6 +141,7 @@ gsutil -m cp -r gs://gresearch/robotics/droid_100 /your/data/ > [!WARNING] > Large datasets require substantial time and storage: +> > - **Full DROID (1.7TB)**: Several days to download depending on bandwidth > - **Processing time**: 7+ days for local porting of full dataset > - **Upload time**: 3+ days to push to Hugging Face Hub @@ -150,6 +159,7 @@ python examples/port_datasets/droid_rlds/port_droid.py \ ### Development and Testing For development, you can port a single shard: + ```bash python examples/port_datasets/droid_rlds/port_droid.py \ --raw-dir /your/data/droid/1.0.1 \ @@ -173,6 +183,7 @@ pip install datatrove # Hugging Face's distributed processing library ### Step 2: Configure Your SLURM Environment Find your partition information: + ```bash sinfo --format="%R" # List available partitions sinfo -N -p your_partition -h -o "%N cpus=%c mem=%m" # Check resources @@ -206,21 +217,25 @@ python examples/port_datasets/droid_rlds/slurm_port_shards.py \ ### Step 4: Monitor Progress Check running jobs: + ```bash squeue -u $USER ``` Monitor overall progress: + ```bash jobs_status /your/logs ``` Inspect individual job logs: + ```bash less /your/logs/port_droid/slurm_jobs/JOB_ID_WORKER_ID.out ``` Debug failed jobs: + ```bash failed_logs /your/logs/port_droid ``` @@ -280,8 +295,6 @@ dataset/ This replaces the old episode-per-file structure with efficient, optimally-sized chunks. - - ## Migrating from Dataset v2.1 If you have existing datasets in v2.1 format, use the migration tool: @@ -292,6 +305,7 @@ python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ ``` This automatically: + - Converts file structure to v3.0 format - Migrates metadata from JSON Lines to parquet - Aggregates statistics and creates per-episode stats diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 7d578e77b..bc1b4240d 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -18,7 +18,7 @@ import logging import shutil import tempfile from pathlib import Path -from typing import Callable +from collections.abc import Callable import datasets import numpy as np diff --git a/src/lerobot/datasets/sampler.py b/src/lerobot/datasets/sampler.py index 02fdc63de..b2c311bec 100644 --- a/src/lerobot/datasets/sampler.py +++ b/src/lerobot/datasets/sampler.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterator, Union +from typing import Union +from collections.abc import Iterator import torch @@ -23,7 +24,7 @@ class EpisodeAwareSampler: self, dataset_from_indices: list[int], dataset_to_indices: list[int], - episode_indices_to_use: Union[list, None] = None, + episode_indices_to_use: list | None = None, drop_n_first_frames: int = 0, drop_n_last_frames: int = 0, shuffle: bool = False, diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index b0f6c15c2..59d45071b 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -345,7 +345,7 @@ def get_audio_info(video_path: Path | str) -> dict: "json", str(video_path), ] - result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(ffprobe_audio_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Error running ffprobe: {result.stderr}") @@ -381,7 +381,7 @@ def get_video_info(video_path: Path | str) -> dict: "json", str(video_path), ] - result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(ffprobe_video_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Error running ffprobe: {result.stderr}")
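
As a minimal, standalone illustration of the two code-level idioms this patch applies (this is not lerobot code: the `run_and_capture` helper and the default `ffprobe -version` command are hypothetical, ffprobe is assumed to be on `PATH`, and the PEP 604 `list[str] | None` annotation assumes Python >= 3.10): `Iterator` is imported from `collections.abc` rather than `typing`, and `subprocess.run(..., capture_output=True)` is the built-in shorthand for passing `stdout=subprocess.PIPE, stderr=subprocess.PIPE`.

```python
import subprocess
from collections.abc import Iterator


def run_and_capture(cmd: list[str] | None = None) -> Iterator[str]:
    """Run a command and yield its stdout lines."""
    if cmd is None:
        # Hypothetical default; assumes ffprobe (part of ffmpeg) is installed.
        cmd = ["ffprobe", "-version"]
    # capture_output=True (available since Python 3.7) is equivalent to
    # stdout=subprocess.PIPE, stderr=subprocess.PIPE -- the same substitution
    # the patch makes in video_utils.py.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Error running {cmd[0]}: {result.stderr}")
    yield from result.stdout.splitlines()


if __name__ == "__main__":
    for line in run_and_capture():
        print(line)
```

Rewrites of this kind (modern union syntax, `collections.abc` imports, `capture_output=True`) are typically produced automatically by pyupgrade-style lint hooks rather than by hand, which is consistent with the `sampler.py` and `video_utils.py` hunks above.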