From 9287c36f379955850bbd4ffb3303044ba03d44b0 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Sun, 6 Jul 2025 22:28:30 +0200 Subject: [PATCH] - Added missing license in the new scripts - Added back legacy functions in conversion script of v2 to v21 - Updated README description for dataset_v3 --- README.md | 38 ++++++++++++------- .../agibot_hdf5/slurm_port_shards.py | 16 ++++++++ .../droid_rlds/display_error_files.py | 16 ++++++++ .../port_datasets/droid_rlds/port_droid.py | 2 +- .../droid_rlds/slurm_aggregate_shards.py | 2 +- .../droid_rlds/slurm_port_shards.py | 16 ++++++++ .../port_datasets/droid_rlds/slurm_upload.py | 16 ++++++++ .../v21/convert_dataset_v20_to_v21.py | 27 +++++++++++-- .../v30/convert_dataset_v21_to_v30.py | 18 ++++++++- tests/datasets/test_aggregate.py | 16 ++++++++ 10 files changed, 146 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 153a3a215..ea343d413 100644 --- a/README.md +++ b/README.md @@ -200,20 +200,30 @@ dataset attributes: │ ├ timestamp (float32): timestamp in the episode │ ├ next.done (bool): indicates the end of an episode ; True for the last frame in each episode │ └ index (int64): general index in the whole dataset - ├ episode_data_index: contains 2 tensors with the start and end indices of each episode - │ ├ from (1D int64 tensor): first frame index for each episode — shape (num episodes,) starts with 0 - │ └ to: (1D int64 tensor): last frame index for each episode — shape (num episodes,) - ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance - │ ├ observation.images.cam_high: {'max': tensor with same number of dimensions (e.g. `(c, 1, 1)` for images, `(c,)` for states), etc.} - │ ... 
- ├ info: a dictionary of metadata on the dataset - │ ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with - │ ├ fps (float): frame per second the dataset is recorded/synchronized to - │ ├ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files - │ └ encoding (dict): if video, this documents the main options that were used with ffmpeg to encode the videos - ├ videos_dir (Path): where the mp4 videos or png images are stored/accessed - └ camera_keys (list of string): the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`) -``` + ├ meta: a LeRobotDatasetMetadata object containing: + │ ├ info: a dictionary of metadata on the dataset + │ │ ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with + │ │ ├ fps (int): frame per second the dataset is recorded/synchronized to + │ │ ├ features (dict): all features contained in the dataset with their shapes and types + │ │ ├ total_episodes (int): total number of episodes in the dataset + │ │ ├ total_frames (int): total number of frames in the dataset + │ │ ├ robot_type (str): robot type used for recording + │ │ ├ data_path (str): formattable string for the parquet files + │ │ └ video_path (str): formattable string for the video files (if using videos) + │ ├ episodes: a DataFrame containing episode metadata with columns: + │ │ ├ episode_index (int): index of the episode + │ │ ├ tasks (list): list of tasks for this episode + │ │ ├ length (int): number of frames in this episode + │ │ ├ dataset_from_index (int): start index of this episode in the dataset + │ │ └ dataset_to_index (int): end index of this episode in the dataset + │ ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance + │ │ ├ observation.images.front_cam: {'max': tensor with same number of dimensions (e.g. 
`(c, 1, 1)` for images, `(c,)` for states), etc.} + │ │ └ ... + │ └ tasks: a DataFrame containing task information with task names as index and task_index as values + ├ root (Path): local directory where the dataset is stored + ├ image_transforms (Callable): optional image transformations to apply to visual modalities + ├ delta_timestamps (dict): optional delta timestamps for temporal queries + └ video_backend (str): the video backend used for decoding videos (e.g., 'pyav', 'torchcodec') A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely: - hf_dataset stored using Hugging Face datasets library serialization to parquet diff --git a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py index 4ce79bafb..9c2587e5f 100644 --- a/examples/port_datasets/agibot_hdf5/slurm_port_shards.py +++ b/examples/port_datasets/agibot_hdf5/slurm_port_shards.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import logging import tarfile diff --git a/examples/port_datasets/droid_rlds/display_error_files.py b/examples/port_datasets/droid_rlds/display_error_files.py index cc6395481..fffab5ff3 100644 --- a/examples/port_datasets/droid_rlds/display_error_files.py +++ b/examples/port_datasets/droid_rlds/display_error_files.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import json from pathlib import Path diff --git a/examples/port_datasets/droid_rlds/port_droid.py b/examples/port_datasets/droid_rlds/port_droid.py index 20e0199c8..4efb131e4 100644 --- a/examples/port_datasets/droid_rlds/port_droid.py +++ b/examples/port_datasets/droid_rlds/port_droid.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py index 692d243da..56dbba230 100644 --- a/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py +++ b/examples/port_datasets/droid_rlds/slurm_aggregate_shards.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/examples/port_datasets/droid_rlds/slurm_port_shards.py b/examples/port_datasets/droid_rlds/slurm_port_shards.py index 602b1f40b..c29d8e94e 100644 --- a/examples/port_datasets/droid_rlds/slurm_port_shards.py +++ b/examples/port_datasets/droid_rlds/slurm_port_shards.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse from pathlib import Path diff --git a/examples/port_datasets/droid_rlds/slurm_upload.py b/examples/port_datasets/droid_rlds/slurm_upload.py index 34bb40df9..91d1fc628 100644 --- a/examples/port_datasets/droid_rlds/slurm_upload.py +++ b/examples/port_datasets/droid_rlds/slurm_upload.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import logging import os diff --git a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py index ae94c4e02..63920d5a2 100644 --- a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py +++ b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py @@ -33,16 +33,38 @@ python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \ import argparse import logging +from pathlib import Path +import jsonlines from huggingface_hub import HfApi from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset -from lerobot.datasets.utils import LEGACY_EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info +from lerobot.datasets.utils import STATS_PATH, load_stats, serialize_dict, write_info from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats V20 = "v2.0" V21 = "v2.1" +### LEGACY FUNCTIONS REMOVED FROM UTILS ### + +LEGACY_EPISODES_STATS_PATH = "episodes_stats.jsonl" + + +def append_jsonlines(data: dict, fpath: Path) -> None: + fpath.parent.mkdir(exist_ok=True, parents=True) + with jsonlines.open(fpath, "a") as writer: + writer.write(data) + + +def legacy_write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path): + # We wrap episode_stats in a dictionary since `episode_stats["episode_index"]` + # is a dictionary of stats and not an integer. 
+ episode_stats = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)} + append_jsonlines(episode_stats, local_dir / LEGACY_EPISODES_STATS_PATH) + + +######## END OF LEGACY FUNCTIONS ######## + class SuppressWarnings: def __enter__(self): @@ -61,9 +83,6 @@ def convert_dataset( with SuppressWarnings(): dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True) - if (dataset.root / LEGACY_EPISODES_STATS_PATH).is_file(): - (dataset.root / LEGACY_EPISODES_STATS_PATH).unlink() - convert_stats(dataset, num_workers=num_workers) ref_stats = load_stats(dataset.root) check_aggregate_stats(dataset, ref_stats) diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index 739a87786..c6bbf97e0 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.1 to 3.0. 
It will: @@ -11,7 +27,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub Usage: ```bash -python lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ +python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \ --repo-id=lerobot/pusht ``` diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py index 6a1b3b9ff..9d75ece38 100644 --- a/tests/datasets/test_aggregate.py +++ b/tests/datasets/test_aggregate.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from lerobot.datasets.aggregate import aggregate_datasets