From 3483e4441e3dbb8743512a3ab501f465346be501 Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Tue, 15 Jul 2025 21:38:18 +0200
Subject: [PATCH] Removed `examples` from import paths in `port_datasets`,
 removed the README from the droid examples, and added a tutorial to the docs

---
 docs/source/_toctree.yml                     |   2 +
 docs/source/porting_datasets_v3.mdx          | 307 ++++++++++++++++++
 .../agibot_hdf5/slurm_port_shards.py         |  11 +-
 examples/port_datasets/droid_rlds/README.md  | 144 --------
 .../droid_rlds/slurm_aggregate_shards.py     |   2 +-
 .../droid_rlds/slurm_port_shards.py          |   5 +-
 .../port_datasets/droid_rlds/slurm_upload.py |   2 +-
 7 files changed, 318 insertions(+), 155 deletions(-)
 create mode 100644 docs/source/porting_datasets_v3.mdx
 delete mode 100644 examples/port_datasets/droid_rlds/README.md

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 83777a3c8..024418834 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -17,6 +17,8 @@
       title: Train a Robot with RL
     - local: hilserl_sim
       title: Train RL in Simulation
+    - local: porting_datasets_v3
+      title: Porting Large Datasets
   title: "Tutorials"
 - sections:
   - local: smolvla
diff --git a/docs/source/porting_datasets_v3.mdx b/docs/source/porting_datasets_v3.mdx
new file mode 100644
index 000000000..593de0bae
--- /dev/null
+++ b/docs/source/porting_datasets_v3.mdx
@@ -0,0 +1,307 @@
# Porting Large Datasets to LeRobot Dataset v3.0

This tutorial explains how to port large-scale robotic datasets to the LeRobot Dataset v3.0 format. We'll use the **DROID 1.0.1** dataset as our primary example; it shows how to handle a multi-terabyte dataset with thousands of shards on a SLURM cluster.

## File Organization: v2.1 vs v3.0

Dataset v3.0 fundamentally changes how data is organized and stored:

**v2.1 Structure (Episode-based)**:
```
dataset/
├── data/chunk-000/episode_000000.parquet
├── data/chunk-000/episode_000001.parquet
├── videos/chunk-000/camera/episode_000000.mp4
└── meta/episodes.jsonl
```

**v3.0 Structure (File-based)**:
```
dataset/
├── data/chunk-000/file-000.parquet           # Multiple episodes per file
├── videos/camera/chunk-000/file-000.mp4      # Consolidated video chunks
└── meta/episodes/chunk-000/file-000.parquet  # Structured metadata
```

This transition from individual episode files to file-based chunks dramatically improves performance and reduces storage overhead.

## What's New in Dataset v3.0

Dataset v3.0 introduces significant improvements for handling large datasets:

### 🏗️ **Enhanced File Organization**
- **File-based structure**: Episodes are now grouped into chunked files rather than stored as individual episode files
- **Configurable file sizes** for data and video files
- **Improved storage efficiency**: Better compression and reduced overhead

### 📊 **Modern Metadata Management**
- **Parquet-based metadata**: Replaces JSON Lines with the efficient parquet format
- **Structured episode access**: Direct pandas DataFrame access via `dataset.meta.episodes`
- **Per-episode statistics**: Enhanced statistics tracking at the episode level

### 🚀 **Performance Enhancements**
- **Memory-mapped access**: Improved RAM usage through PyArrow memory mapping
- **Faster loading**: Significantly reduced dataset initialization time
- **Better scalability**: Designed for datasets with millions of episodes

## Prerequisites

Before porting large datasets, ensure you have:

- **LeRobot installed** with v3.0 support. Follow our [Installation Guide](./installation).
- **Sufficient storage**: Raw datasets can be very large (e.g., DROID requires ~2TB of local disk)
- **Cluster access** (recommended for large datasets): SLURM or a similar job scheduler
- **Dataset-specific dependencies**: For DROID, you'll need TensorFlow Dataset utilities
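Once LeRobot is installed, a quick way to confirm v3.0 support is to load a dataset and inspect its metadata. This is a minimal sanity check; `lerobot/pusht` is only an illustrative repo id, and the `meta.episodes` access is the DataFrame behavior described above:

```python
# Minimal sanity check: load a dataset and inspect its v3.0 metadata.
# "lerobot/pusht" is illustrative; substitute any repo in v3.0 format.
from lerobot.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("lerobot/pusht")
print(dataset.num_episodes, dataset.fps)  # basic dataset info
print(dataset.meta.episodes)              # per-episode metadata as a pandas DataFrame
```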
## Understanding the DROID Dataset

[DROID 1.0.1](https://droid-dataset.github.io/droid/the-droid-dataset) is an excellent example of a large-scale robotic dataset:
- **Size**: 1.7TB (RLDS format), 8.7TB (raw data)
- **Structure**: 2048 pre-defined TensorFlow dataset shards
- **Content**: 76,000+ robot manipulation trajectories from Franka Emika Panda robots
- **Scope**: Real-world manipulation tasks across multiple environments and objects
- **Format**: Originally in TensorFlow Records/RLDS format, requiring conversion to LeRobot format
- **Hosting**: Google Cloud Storage with public access via `gsutil`

The dataset contains diverse manipulation demonstrations with:
- Multiple camera views (wrist camera, exterior cameras)
- Natural language task descriptions
- Robot proprioceptive state and actions
- Success/failure annotations

### DROID Features Schema

```python
DROID_FEATURES = {
    # Episode markers
    "is_first": {"dtype": "bool", "shape": (1,)},
    "is_last": {"dtype": "bool", "shape": (1,)},
    "is_terminal": {"dtype": "bool", "shape": (1,)},

    # Language instructions
    "language_instruction": {"dtype": "string", "shape": (1,)},
    "language_instruction_2": {"dtype": "string", "shape": (1,)},
    "language_instruction_3": {"dtype": "string", "shape": (1,)},

    # Robot state
    "observation.state.gripper_position": {"dtype": "float32", "shape": (1,)},
    "observation.state.cartesian_position": {"dtype": "float32", "shape": (6,)},
    "observation.state.joint_position": {"dtype": "float32", "shape": (7,)},

    # Camera observations
    "observation.images.wrist_left": {"dtype": "image"},
    "observation.images.exterior_1_left": {"dtype": "image"},
    "observation.images.exterior_2_left": {"dtype": "image"},

    # Actions
    "action.gripper_position": {"dtype": "float32", "shape": (1,)},
    "action.cartesian_position": {"dtype": "float32", "shape": (6,)},
    "action.joint_position": {"dtype": "float32", "shape": (7,)},

    # Standard LeRobot format
    "observation.state": {"dtype": "float32", "shape": (8,)},  # joints + gripper
    "action": {"dtype": "float32", "shape": (8,)},  # joints + gripper
}
```

## Approach 1: Single Computer Porting

### Step 1: Install Dependencies

For DROID specifically:
```bash
pip install tensorflow
pip install tensorflow_datasets
```

For other datasets, install the appropriate readers for your source format.
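To make the porting flow concrete before moving on: a port script consumes a features schema like `DROID_FEATURES` above, creates an empty dataset from it, and appends frames episode by episode. Below is a minimal sketch under stated assumptions: `read_raw_episodes` is a hypothetical reader for your source format, and the `LeRobotDataset.create`/`add_frame`/`save_episode` calls should be checked against your installed LeRobot version:

```python
# Minimal sketch of a porting loop; not the actual port_droid.py implementation.
from lerobot.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset.create(
    repo_id="your_id/my_dataset",
    fps=15,                   # DROID is recorded at 15 fps
    features=DROID_FEATURES,  # the schema shown above
)

for episode in read_raw_episodes("/your/data"):  # hypothetical source-format reader
    for step in episode:
        # One frame per timestep; keys must match the features schema.
        dataset.add_frame({**step, "task": step["language_instruction"]})
    dataset.save_episode()  # encodes video and flushes the episode to chunked files
```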
### Step 2: Download Raw Data

Download DROID from Google Cloud Storage using `gsutil`:

```bash
# Install Google Cloud SDK if not already installed
# https://cloud.google.com/sdk/docs/install

# Download the full RLDS dataset (1.7TB)
gsutil -m cp -r gs://gresearch/robotics/droid/1.0.1 /your/data/

# Or download just the 100-episode sample (2GB) for testing
gsutil -m cp -r gs://gresearch/robotics/droid_100 /your/data/
```

> [!WARNING]
> Large datasets require substantial time and storage:
> - **Full DROID (1.7TB)**: Several days to download, depending on bandwidth
> - **Processing time**: 7+ days for local porting of the full dataset
> - **Upload time**: 3+ days to push to the Hugging Face Hub
> - **Local storage**: ~400GB for the processed LeRobot format

### Step 3: Port the Dataset

```bash
python examples/port_datasets/droid_rlds/port_droid.py \
    --raw-dir /your/data/droid/1.0.1 \
    --repo-id your_id/droid_1.0.1 \
    --push-to-hub
```

### Development and Testing

For development, you can port a single shard:
```bash
python examples/port_datasets/droid_rlds/port_droid.py \
    --raw-dir /your/data/droid/1.0.1 \
    --repo-id your_id/droid_1.0.1_test \
    --num-shards 2048 \
    --shard-index 0
```

This approach works for smaller datasets or testing, but large datasets require cluster computing.

## Approach 2: SLURM Cluster Porting (Recommended)

For large datasets like DROID, parallel processing across multiple nodes dramatically reduces processing time.

### Step 1: Install Cluster Dependencies

```bash
pip install datatrove  # Hugging Face's distributed processing library
```

### Step 2: Configure Your SLURM Environment

Find your partition information:
```bash
sinfo --format="%R"  # List available partitions
sinfo -N -p your_partition -h -o "%N cpus=%c mem=%m"  # Check resources
```

Choose a **CPU partition**; no GPU is needed for dataset porting.

### Step 3: Launch Parallel Porting Jobs

```bash
python examples/port_datasets/droid_rlds/slurm_port_shards.py \
    --raw-dir /your/data/droid/1.0.1 \
    --repo-id your_id/droid_1.0.1 \
    --logs-dir /your/logs \
    --job-name port_droid \
    --partition your_partition \
    --workers 2048 \
    --cpus-per-task 8 \
    --mem-per-cpu 1950M
```

#### Parameter Guidelines

- **`--workers`**: Number of parallel jobs (at most 2048, DROID's shard count)
- **`--cpus-per-task`**: 8 CPUs recommended to parallelize frame encoding
- **`--mem-per-cpu`**: 1950M per CPU (~16GB total with 8 CPUs) for loading raw frames

> [!TIP]
> Start with fewer workers (e.g., 100) to test your cluster configuration before launching thousands of jobs.
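Under the hood, `slurm_port_shards.py` wraps the port function in a datatrove `PipelineStep` and schedules one task per shard, which is how the `--workers`, `--cpus-per-task`, and `--mem-per-cpu` flags map onto SLURM. A simplified sketch (the executor arguments follow datatrove's `SlurmPipelineExecutor`, and the `port_droid` signature is approximated from the CLI flags above):

```python
# Simplified sketch of the launcher; see slurm_port_shards.py for the real logic.
from datatrove.executor.slurm import SlurmPipelineExecutor
from datatrove.pipeline.base import PipelineStep


class PortDroidShards(PipelineStep):
    def __init__(self, raw_dir: str, repo_id: str):
        super().__init__()
        self.raw_dir = raw_dir
        self.repo_id = repo_id

    def run(self, data=None, rank: int = 0, world_size: int = 1):
        # Each SLURM task ports one of the 2048 shards into its own
        # intermediate dataset, e.g. droid_1.0.1_world_2048_rank_1594.
        from port_datasets.droid_rlds.port_droid import port_droid  # signature approximated

        port_droid(
            raw_dir=self.raw_dir,
            repo_id=f"{self.repo_id}_world_{world_size}_rank_{rank}",
            num_shards=world_size,
            shard_index=rank,
        )


executor = SlurmPipelineExecutor(
    job_name="port_droid",
    pipeline=[PortDroidShards("/your/data/droid/1.0.1", "your_id/droid_1.0.1")],
    tasks=2048,           # one task per DROID shard
    workers=2048,         # upper bound on concurrently running jobs
    time="24:00:00",      # max runtime per job
    partition="your_partition",
    cpus_per_task=8,
    mem_per_cpu_gb=2,     # roughly the 1950M per CPU used above
    logging_dir="/your/logs/port_droid",
)
executor.run()
```

Aggregation (Step 5) and upload (Step 6) reuse the same pattern with different pipeline steps.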
### Step 4: Monitor Progress

Check running jobs:
```bash
squeue -u $USER
```

Monitor overall progress:
```bash
jobs_status /your/logs
```

Inspect individual job logs:
```bash
less /your/logs/port_droid/slurm_jobs/JOB_ID_WORKER_ID.out
```

Debug failed jobs:
```bash
failed_logs /your/logs/port_droid
```

### Step 5: Aggregate Shards

Once all porting jobs complete:

```bash
python examples/port_datasets/droid_rlds/slurm_aggregate_shards.py \
    --repo-id your_id/droid_1.0.1 \
    --logs-dir /your/logs \
    --job-name aggr_droid \
    --partition your_partition \
    --workers 2048 \
    --cpus-per-task 8 \
    --mem-per-cpu 1950M
```

### Step 6: Upload to Hub

```bash
python examples/port_datasets/droid_rlds/slurm_upload.py \
    --repo-id your_id/droid_1.0.1 \
    --logs-dir /your/logs \
    --job-name upload_droid \
    --partition your_partition \
    --workers 50 \
    --cpus-per-task 4 \
    --mem-per-cpu 1950M
```

> [!NOTE]
> Upload uses fewer workers (50) since it's network-bound rather than compute-bound.

## Dataset v3.0 File Structure

Your completed dataset will have this modern structure:

```
dataset/
├── meta/
│   ├── episodes/
│   │   └── chunk-000/
│   │       └── file-000.parquet  # Episode metadata
│   ├── tasks.parquet             # Task definitions
│   ├── stats.json                # Aggregated statistics
│   └── info.json                 # Dataset information
├── data/
│   └── chunk-000/
│       └── file-000.parquet      # Consolidated episode data
└── videos/
    └── camera_key/
        └── chunk-000/
            └── file-000.mp4      # Consolidated video files
```

This replaces the old episode-per-file structure with efficient, optimally-sized chunks.

## Migrating from Dataset v2.1

If you have existing datasets in v2.1 format, use the migration tool:

```bash
python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \
    --repo-id your_id/existing_dataset
```

This automatically:
- Converts file structure to v3.0 format
- Migrates metadata from JSON Lines to parquet
- Aggregates statistics and creates per-episode stats
- Updates version information

## Performance Benefits

Dataset v3.0 provides significant improvements for large datasets:

- **Faster loading**: 3-5x reduction in initialization time
- **Memory efficiency**: Better RAM usage through memory mapping
- **Scalable processing**: Handles millions of episodes efficiently
- **Storage optimization**: Reduced file count and improved compression
diff --git a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py
index 9c2587e5f..dbd10a7ac 100644
--- a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py
+++ b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py
@@ -22,8 +22,7 @@ from pathlib import Path
 from datatrove.executor import LocalPipelineExecutor
 from datatrove.executor.slurm import SlurmPipelineExecutor
 from datatrove.pipeline.base import PipelineStep
-
-from examples.port_datasets.agibot_hdf5.download import (
+from port_datasets.agibot_hdf5.download import (
     RAW_REPO_ID,
     download_meta_data,
     get_observations_files,
@@ -44,15 +43,15 @@ class PortAgiBotShards(PipelineStep):
         import shutil
 
         from datasets.utils.tqdm import disable_progress_bars
-
-        from examples.port_datasets.agibot_hdf5.download import (
+        from port_datasets.agibot_hdf5.download import (
             RAW_REPO_ID,
             download,
             get_observations_files,
             no_depth,
         )
-        from examples.port_datasets.agibot_hdf5.port_agibot import port_agibot
-        from examples.port_datasets.droid_rlds.port_droid import validate_dataset
+        from port_datasets.agibot_hdf5.port_agibot import port_agibot
+        from port_datasets.droid_rlds.port_droid import validate_dataset
+
         from lerobot.constants import HF_LEROBOT_HOME
         from lerobot.utils.utils import init_logging
diff --git a/examples/port_datasets/droid_rlds/README.md b/examples/port_datasets/droid_rlds/README.md
deleted file mode 100644
index 9cbb8969f..000000000
--- a/examples/port_datasets/droid_rlds/README.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# Port DROID 1.0.1 dataset to LeRobotDataset
-
-## Download
-
-TODO
-
-It will take 2 TB in your local disk.
-
-## Port on a single computer
-
-First, install tensorflow dataset utilities to read from raw files:
-```bash
-pip install tensorflow
-pip install tensorflow_datasets
-```
-
-Then run this script to start porting the dataset:
-```bash
-python examples/port_datasets/droid_rlds/port_droid.py \
-    --raw-dir /your/data/droid/1.0.1 \
-    --repo-id your_id/droid_1.0.1 \
-    --push-to-hub
-```
-
-It will take 400GB in your local disk.
-
-As usual, your LeRobotDataset will be stored in your huggingface/lerobot cache folder.
-
-WARNING: it will take 7 days for porting the dataset locally and 3 days to upload, so we will need to parallelize over multiple nodes on a slurm cluster.
-
-NOTE: For development, run this script to start porting a shard:
-```bash
-python examples/port_datasets/droid_rlds/port.py \
-    --raw-dir /your/data/droid/1.0.1 \
-    --repo-id your_id/droid_1.0.1 \
-    --num-shards 2048 \
-    --shard-index 0
-```
-
-## Port over SLURM
-
-Install slurm utilities from Hugging Face:
-```bash
-pip install datatrove
-```
-
-
-### 1. Port one shard per job
-
-Run this script to start porting shards of the dataset:
-```bash
-python examples/port_datasets/droid_rlds/slurm_port_shards.py \
-    --raw-dir /your/data/droid/1.0.1 \
-    --repo-id your_id/droid_1.0.1 \
-    --logs-dir /your/logs \
-    --job-name port_droid \
-    --partition your_partition \
-    --workers 2048 \
-    --cpus-per-task 8 \
-    --mem-per-cpu 1950M
-```
-
-**Note on how to set your command line arguments**
-
-Regarding `--partition`, find yours by running:
-```bash
-info --format="%R"`
-```
-and select the CPU partition if you have one. No GPU needed.
-
-Regarding `--workers`, it is the number of slurm jobs you will launch in parallel. 2048 is the maximum number, since there is 2048 shards in Droid. This big number will certainly max-out your cluster.
-
-Regarding `--cpus-per-task` and `--mem-per-cpu`, by default it will use ~16GB of RAM (8*1950M) which is recommended to load the raw frames and 8 CPUs which can be useful to parallelize the encoding of the frames.
-
-Find the number of CPUs and Memory of the nodes of your partition by running:
-```bash
-sinfo -N -p your_partition -h -o "%N cpus=%c mem=%m"
-```
-
-**Useful commands to check progress and debug**
-
-Check if your jobs are running:
-```bash
-squeue -u $USER`
-```
-
-You should see a list with job indices like `15125385_155` where `15125385` is the index of the run and `155` is the worker index. The output/print of this worker is written in real time in `/your/logs/job_name/slurm_jobs/15125385_155.out`. For instance, you can inspect the content of this file by running `less /your/logs/job_name/slurm_jobs/15125385_155.out`.
-
-Check the progression of your jobs by running:
-```bash
-jobs_status /your/logs
-```
-
-If it's not 100% and no more slurm job is running, it means that some of them failed. Inspect the logs by running:
-```bash
-failed_logs /your/logs/job_name
-```
-
-If there is an issue in the code, you can fix it in debug mode with `--slurm 0` which allows to set breakpoint:
-```bash
-python examples/port_datasets/droid_rlds/slurm_port_shards.py --slurm 0 ...
-```
-
-And you can relaunch the same command, which will skip the completed jobs:
-```bash
-python examples/port_datasets/droid_rlds/slurm_port_shards.py --slurm 1 ...
-```
-
-Once all jobs are completed, you will have one dataset per shard (e.g. `droid_1.0.1_world_2048_rank_1594`) saved on disk in your `/lerobot/home/dir/your_id` directory. You can find your `/lerobot/home/dir` by running:
-```bash
-python -c "from lerobot.constants import HF_LEROBOT_HOME;print(HF_LEROBOT_HOME)"
-```
-
-
-### 2. Aggregate all shards
-
-Run this script to start aggregation:
-```bash
-python examples/port_datasets/droid_rlds/slurm_aggregate_shards.py \
-    --repo-id your_id/droid_1.0.1 \
-    --logs-dir /your/logs \
-    --job-name aggr_droid \
-    --partition your_partition \
-    --workers 2048 \
-    --cpus-per-task 8 \
-    --mem-per-cpu 1950M
-```
-
-Once all jobs are completed, you will have one dataset your `/lerobot/home/dir/your_id/droid_1.0.1` directory.
-
-
-### 3. Upload dataset
-
-Run this script to start uploading:
-```bash
-python examples/port_datasets/droid_rlds/slurm_upload.py \
-    --repo-id your_id/droid_1.0.1 \
-    --logs-dir /your/logs \
-    --job-name upload_droid \
-    --partition your_partition \
-    --workers 50 \
-    --cpus-per-task 4 \
-    --mem-per-cpu 1950M
-```
diff --git a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py
index 9d026be35..4e1b71a31 100644
--- a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py
+++ b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py
@@ -21,8 +21,8 @@ from pathlib import Path
 from datatrove.executor import LocalPipelineExecutor
 from datatrove.executor.slurm import SlurmPipelineExecutor
 from datatrove.pipeline.base import PipelineStep
+from port_datasets.droid_rlds.port_droid import DROID_SHARDS
 
-from examples.port_datasets.droid_rlds.port_droid import DROID_SHARDS
 from lerobot.datasets.aggregate import aggregate_datasets
 from lerobot.utils.utils import init_logging
diff --git a/examples/port_datasets/droid_rlds/slurm_port_shards.py b/examples/port_datasets/droid_rlds/slurm_port_shards.py
index c29d8e94e..3bb4c135c 100644
--- a/examples/port_datasets/droid_rlds/slurm_port_shards.py
+++ b/examples/port_datasets/droid_rlds/slurm_port_shards.py
@@ -20,8 +20,7 @@ from pathlib import Path
 from datatrove.executor import LocalPipelineExecutor
 from datatrove.executor.slurm import SlurmPipelineExecutor
 from datatrove.pipeline.base import PipelineStep
-
-from examples.port_datasets.droid_rlds.port_droid import DROID_SHARDS
+from port_datasets.droid_rlds.port_droid import DROID_SHARDS
 
 
 class PortDroidShards(PipelineStep):
@@ -36,8 +35,8 @@ class PortDroidShards(PipelineStep):
 
     def run(self, data=None, rank: int = 0, world_size: int = 1):
         from datasets.utils.tqdm import disable_progress_bars
+        from port_datasets.droid_rlds.port_droid import port_droid, validate_dataset
 
-        from examples.port_datasets.droid_rlds.port_droid import port_droid, validate_dataset
         from lerobot.utils.utils import init_logging
 
         init_logging()
diff --git a/examples/port_datasets/droid_rlds/slurm_upload.py b/examples/port_datasets/droid_rlds/slurm_upload.py
index 91d1fc628..c9d227126 100644
--- a/examples/port_datasets/droid_rlds/slurm_upload.py
+++ b/examples/port_datasets/droid_rlds/slurm_upload.py
@@ -24,8 +24,8 @@ from datatrove.executor.slurm import SlurmPipelineExecutor
 from datatrove.pipeline.base import PipelineStep
 from huggingface_hub import HfApi
 from huggingface_hub.constants import REPOCARD_NAME
+from port_datasets.droid_rlds.port_droid import DROID_SHARDS
 
-from examples.port_datasets.droid_rlds.port_droid import DROID_SHARDS
 from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDatasetMetadata
 from lerobot.datasets.utils import create_lerobot_dataset_card
 from lerobot.utils.utils import init_logging