From 788dde3a34722f0bb4cb3f8e7da63930eb3bffd7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Jul 2025 13:56:44 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 4 ++-- docs/source/porting_datasets_v3.mdx | 18 ++++++++++++++++-- src/lerobot/datasets/lerobot_dataset.py | 2 +- src/lerobot/datasets/sampler.py | 5 +++-- src/lerobot/datasets/video_utils.py | 4 ++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c38426d4f..ac55d6ecf 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ Under the hood, the `LeRobotDataset` format makes use of several ways to seriali Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects: -``` +```` dataset attributes: ├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example: │ ├ observation.images.cam_high (VideoFrame): @@ -278,7 +278,7 @@ python -m lerobot.scripts.eval \ --eval.n_episodes=10 \ --policy.use_amp=false \ --policy.device=cuda -``` +```` Note: After training your own policy, you can re-evaluate the checkpoints with: diff --git a/docs/source/porting_datasets_v3.mdx b/docs/source/porting_datasets_v3.mdx index 593de0bae..36e5f93cb 100644 --- a/docs/source/porting_datasets_v3.mdx +++ b/docs/source/porting_datasets_v3.mdx @@ -7,6 +7,7 @@ This tutorial explains how to port large-scale robotic datasets to the LeRobot D Dataset v3.0 fundamentally changes how data is organized and stored: **v2.1 Structure (Episode-based)**: + ``` dataset/ ├── data/chunk-000/episode_000000.parquet @@ -16,6 +17,7 @@ dataset/ ``` **v3.0 Structure (File-based)**: + ``` dataset/ ├── data/chunk-000/file-000.parquet # Multiple episodes per file @@ -30,16 +32,19 @@ This transition from individual episode files to file-based chunks dramatically Dataset v3.0 introduces significant improvements for handling large datasets: ### 🏗️ **Enhanced File Organization** + - **File-based structure**: Episodes are now grouped into chunked files rather than individual episode files - **Configurable file sizes**: for data and video files - **Improved storage efficiency**: Better compression and reduced overhead ### 📊 **Modern Metadata Management** + - **Parquet-based metadata**: Replaced JSON Lines with efficient parquet format - **Structured episode access**: Direct pandas DataFrame access via `dataset.meta.episodes` - **Per-episode statistics**: Enhanced statistics tracking at episode level ### 🚀 **Performance Enhancements** + - **Memory-mapped access**: Improved RAM usage through PyArrow memory mapping - **Faster loading**: Significantly reduced dataset initialization time - **Better scalability**: Designed for datasets with millions of episodes @@ -56,6 +61,7 @@ Before porting large datasets, ensure you have: ## Understanding the DROID Dataset [DROID 1.0.1](https://droid-dataset.github.io/droid/the-droid-dataset) is an excellent example of a large-scale robotic dataset: + - **Size**: 1.7TB (RLDS format), 8.7TB (raw data) - **Structure**: 2048 pre-defined TensorFlow dataset shards - **Content**: 76,000+ robot manipulation trajectories from Franka Emika Panda robots @@ -64,6 +70,7 @@ Before porting large datasets, ensure you have: - **Hosting**: Google Cloud 
Storage with public access via `gsutil` The dataset contains diverse manipulation demonstrations with: + - Multiple camera views (wrist camera, exterior cameras) - Natural language task descriptions - Robot proprioceptive state and actions @@ -109,6 +116,7 @@ DROID_FEATURES = { ### Step 1: Install Dependencies For DROID specifically: + ```bash pip install tensorflow pip install tensorflow_datasets @@ -133,6 +141,7 @@ gsutil -m cp -r gs://gresearch/robotics/droid_100 /your/data/ > [!WARNING] > Large datasets require substantial time and storage: +> > - **Full DROID (1.7TB)**: Several days to download depending on bandwidth > - **Processing time**: 7+ days for local porting of full dataset > - **Upload time**: 3+ days to push to Hugging Face Hub @@ -150,6 +159,7 @@ python examples/port_datasets/droid_rlds/port_droid.py \ ### Development and Testing For development, you can port a single shard: + ```bash python examples/port_datasets/droid_rlds/port_droid.py \ --raw-dir /your/data/droid/1.0.1 \ @@ -173,6 +183,7 @@ pip install datatrove # Hugging Face's distributed processing library ### Step 2: Configure Your SLURM Environment Find your partition information: + ```bash sinfo --format="%R" # List available partitions sinfo -N -p your_partition -h -o "%N cpus=%c mem=%m" # Check resources @@ -206,21 +217,25 @@ python examples/port_datasets/droid_rlds/slurm_port_shards.py \ ### Step 4: Monitor Progress Check running jobs: + ```bash squeue -u $USER ``` Monitor overall progress: + ```bash jobs_status /your/logs ``` Inspect individual job logs: + ```bash less /your/logs/port_droid/slurm_jobs/JOB_ID_WORKER_ID.out ``` Debug failed jobs: + ```bash failed_logs /your/logs/port_droid ``` @@ -280,8 +295,6 @@ dataset/ This replaces the old episode-per-file structure with efficient, optimally-sized chunks. - - ## Migrating from Dataset v2.1 If you have existing datasets in v2.1 format, use the migration tool: @@ -292,6 +305,7 @@ python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ ``` This automatically: + - Converts file structure to v3.0 format - Migrates metadata from JSON Lines to parquet - Aggregates statistics and creates per-episode stats diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 7d578e77b..bc1b4240d 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -18,7 +18,7 @@ import logging import shutil import tempfile from pathlib import Path -from typing import Callable +from collections.abc import Callable import datasets import numpy as np diff --git a/src/lerobot/datasets/sampler.py b/src/lerobot/datasets/sampler.py index 02fdc63de..b2c311bec 100644 --- a/src/lerobot/datasets/sampler.py +++ b/src/lerobot/datasets/sampler.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterator, Union +from typing import Union +from collections.abc import Iterator import torch @@ -23,7 +24,7 @@ class EpisodeAwareSampler: self, dataset_from_indices: list[int], dataset_to_indices: list[int], - episode_indices_to_use: Union[list, None] = None, + episode_indices_to_use: list | None = None, drop_n_first_frames: int = 0, drop_n_last_frames: int = 0, shuffle: bool = False, diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index b0f6c15c2..59d45071b 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -345,7 +345,7 @@ def get_audio_info(video_path: Path | str) -> dict: "json", str(video_path), ] - result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(ffprobe_audio_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Error running ffprobe: {result.stderr}") @@ -381,7 +381,7 @@ def get_video_info(video_path: Path | str) -> dict: "json", str(video_path), ] - result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(ffprobe_video_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Error running ffprobe: {result.stderr}")
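
As a minimal, standalone illustration of the two code-level idioms this patch applies (this is not lerobot code: the `run_and_capture` helper and the default `ffprobe -version` command are hypothetical, ffprobe is assumed to be on `PATH`, and the PEP 604 `list[str] | None` annotation assumes Python >= 3.10): `Iterator` is imported from `collections.abc` rather than `typing`, and `subprocess.run(..., capture_output=True)` is the built-in shorthand for passing `stdout=subprocess.PIPE, stderr=subprocess.PIPE`.

```python
import subprocess
from collections.abc import Iterator


def run_and_capture(cmd: list[str] | None = None) -> Iterator[str]:
    """Run a command and yield its stdout lines."""
    if cmd is None:
        # Hypothetical default; assumes ffprobe (part of ffmpeg) is installed.
        cmd = ["ffprobe", "-version"]
    # capture_output=True (available since Python 3.7) is equivalent to
    # stdout=subprocess.PIPE, stderr=subprocess.PIPE -- the same substitution
    # the patch makes in video_utils.py.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Error running {cmd[0]}: {result.stderr}")
    yield from result.stdout.splitlines()


if __name__ == "__main__":
    for line in run_and_capture():
        print(line)
```

Rewrites of this kind (modern union syntax, `collections.abc` imports, `capture_output=True`) are typically produced automatically by pyupgrade-style lint hooks rather than by hand, which is consistent with the `sampler.py` and `video_utils.py` hunks above.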