From 9287c36f379955850bbd4ffb3303044ba03d44b0 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Sun, 6 Jul 2025 22:28:30 +0200 Subject: [PATCH] - Added missing license in the new scripts - Added back legacy functions in conversion script of v2 to v21 - Updated README description for dataset_v3 --- README.md | 38 ++++++++++++------- .../agibot_hdf5/slurm_port_shards.py | 16 ++++++++ .../droid_rlds/display_error_files.py | 16 ++++++++ .../port_datasets/droid_rlds/port_droid.py | 2 +- .../droid_rlds/slurm_aggregate_shards.py | 2 +- .../droid_rlds/slurm_port_shards.py | 16 ++++++++ .../port_datasets/droid_rlds/slurm_upload.py | 16 ++++++++ .../v21/convert_dataset_v20_to_v21.py | 27 +++++++++++-- .../v30/convert_dataset_v21_to_v30.py | 18 ++++++++- tests/datasets/test_aggregate.py | 16 ++++++++ 10 files changed, 146 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 153a3a215..ea343d413 100644 --- a/README.md +++ b/README.md @@ -200,20 +200,30 @@ dataset attributes: │ ├ timestamp (float32): timestamp in the episode │ ├ next.done (bool): indicates the end of an episode ; True for the last frame in each episode │ └ index (int64): general index in the whole dataset - ├ episode_data_index: contains 2 tensors with the start and end indices of each episode - │ ├ from (1D int64 tensor): first frame index for each episode — shape (num episodes,) starts with 0 - │ └ to: (1D int64 tensor): last frame index for each episode — shape (num episodes,) - ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance - │ ├ observation.images.cam_high: {'max': tensor with same number of dimensions (e.g. `(c, 1, 1)` for images, `(c,)` for states), etc.} - │ ... 
- ├ info: a dictionary of metadata on the dataset - │ ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with - │ ├ fps (float): frame per second the dataset is recorded/synchronized to - │ ├ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files - │ └ encoding (dict): if video, this documents the main options that were used with ffmpeg to encode the videos - ├ videos_dir (Path): where the mp4 videos or png images are stored/accessed - └ camera_keys (list of string): the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`) -``` + ├ meta: a LeRobotDatasetMetadata object containing: + │ ├ info: a dictionary of metadata on the dataset + │ │ ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with + │ │ ├ fps (int): frame per second the dataset is recorded/synchronized to + │ │ ├ features (dict): all features contained in the dataset with their shapes and types + │ │ ├ total_episodes (int): total number of episodes in the dataset + │ │ ├ total_frames (int): total number of frames in the dataset + │ │ ├ robot_type (str): robot type used for recording + │ │ ├ data_path (str): formattable string for the parquet files + │ │ └ video_path (str): formattable string for the video files (if using videos) + │ ├ episodes: a DataFrame containing episode metadata with columns: + │ │ ├ episode_index (int): index of the episode + │ │ ├ tasks (list): list of tasks for this episode + │ │ ├ length (int): number of frames in this episode + │ │ ├ dataset_from_index (int): start index of this episode in the dataset + │ │ └ dataset_to_index (int): end index of this episode in the dataset + │ ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance + │ │ ├ observation.images.front_cam: {'max': tensor with same number of dimensions (e.g. 
`(c, 1, 1)` for images, `(c,)` for states), etc.} + │ │ └ ... + │ └ tasks: a DataFrame containing task information with task names as index and task_index as values + ├ root (Path): local directory where the dataset is stored + ├ image_transforms (Callable): optional image transformations to apply to visual modalities + ├ delta_timestamps (dict): optional delta timestamps for temporal queries + └ video_backend (str): the video backend used for decoding videos (e.g., 'pyav', 'torchcodec') A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely: - hf_dataset stored using Hugging Face datasets library serialization to parquet diff --git a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py index 4ce79bafb..9c2587e5f 100644 --- a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py +++ b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import logging import tarfile diff --git a/examples/port_datasets/droid_rlds/display_error_files.py b/examples/port_datasets/droid_rlds/display_error_files.py index cc6395481..fffab5ff3 100644 --- a/examples/port_datasets/droid_rlds/display_error_files.py +++ b/examples/port_datasets/droid_rlds/display_error_files.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import json from pathlib import Path diff --git a/examples/port_datasets/droid_rlds/port_droid.py b/examples/port_datasets/droid_rlds/port_droid.py index 20e0199c8..4efb131e4 100644 --- a/examples/port_datasets/droid_rlds/port_droid.py +++ b/examples/port_datasets/droid_rlds/port_droid.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py index 692d243da..56dbba230 100644 --- a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py +++ b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/examples/port_datasets/droid_rlds/slurm_port_shards.py b/examples/port_datasets/droid_rlds/slurm_port_shards.py index 602b1f40b..c29d8e94e 100644 --- a/examples/port_datasets/droid_rlds/slurm_port_shards.py +++ b/examples/port_datasets/droid_rlds/slurm_port_shards.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse from pathlib import Path diff --git a/examples/port_datasets/droid_rlds/slurm_upload.py b/examples/port_datasets/droid_rlds/slurm_upload.py index 34bb40df9..91d1fc628 100644 --- a/examples/port_datasets/droid_rlds/slurm_upload.py +++ b/examples/port_datasets/droid_rlds/slurm_upload.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import logging import os diff --git a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py index ae94c4e02..63920d5a2 100644 --- a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py +++ b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py @@ -33,16 +33,38 @@ python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \ import argparse import logging +from pathlib import Path +import jsonlines from huggingface_hub import HfApi from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset -from lerobot.datasets.utils import LEGACY_EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info +from lerobot.datasets.utils import STATS_PATH, load_stats, serialize_dict, write_info from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats V20 = "v2.0" V21 = "v2.1" +### LEGACY FUNCTIONS REMOVED FROM UTILS ### + +LEGACY_EPISODES_STATS_PATH = "episodes_stats.jsonl" + + +def append_jsonlines(data: dict, fpath: Path) -> None: + fpath.parent.mkdir(exist_ok=True, parents=True) + with jsonlines.open(fpath, "a") as writer: + writer.write(data) + + +def legacy_write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path): + # We wrap episode_stats in a dictionary since `episode_stats["episode_index"]` + # is a dictionary of stats and not an integer. 
+ episode_stats = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)} + append_jsonlines(episode_stats, local_dir / LEGACY_EPISODES_STATS_PATH) + + +######## END OF LEGACY FUNCTIONS ######## + class SuppressWarnings: def __enter__(self): @@ -61,9 +83,6 @@ def convert_dataset( with SuppressWarnings(): dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True) - if (dataset.root / LEGACY_EPISODES_STATS_PATH).is_file(): - (dataset.root / LEGACY_EPISODES_STATS_PATH).unlink() - convert_stats(dataset, num_workers=num_workers) ref_stats = load_stats(dataset.root) check_aggregate_stats(dataset, ref_stats) diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index 739a87786..c6bbf97e0 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.1 to 3.0. 
It will: @@ -11,7 +27,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub Usage: ```bash -python lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ +python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ --repo-id=lerobot/pusht ``` diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py index 6a1b3b9ff..9d75ece38 100644 --- a/tests/datasets/test_aggregate.py +++ b/tests/datasets/test_aggregate.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from lerobot.datasets.aggregate import aggregate_datasets