feat(convert_dataset_v21_to_v3): use the more memory-efficient Dataset.from_parquet and concatenate_datasets

2026-07-24 18:26:11 +00:00 · 2025-07-22 17:27:41 +02:00
parent 670d7f485f
commit 218ebed3ef
4 changed files with 9 additions and 68 deletions
@@ -22,7 +22,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub
 - Check consistency between these new stats and the old ones.
 - Remove the deprecated `stats.json`.
 - Update codebase_version in `info.json`.
- Push this new version to the hub on the 'main' branch and tags it with "v2.1".
+- Push this new version to the hub on the 'main' branch and tag it with "v3.0".

 Usage:

@@ -40,9 +40,8 @@ from typing import Any

 import jsonlines
 import pandas as pd
-import pyarrow as pa
 import tqdm
-from datasets import Dataset, Features, Image
+from datasets import Dataset, Image, concatenate_datasets
 from huggingface_hub import HfApi, snapshot_download
 from requests import HTTPError

@@ -153,24 +152,21 @@ def convert_tasks(root, new_root):


 def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
-    # TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
-    dataframes = [pd.read_parquet(file) for file in paths_to_cat]
-    # Concatenate all DataFrames along rows
-    concatenated_df = pd.concat(dataframes, ignore_index=True)
+    # Save RAM by using Dataset.from_parquet and concatenate_datasets
+    datasets = [Dataset.from_parquet(file) for file in paths_to_cat]
+    concatenated_dataset = concatenate_datasets(datasets)

    path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
    path.parent.mkdir(parents=True, exist_ok=True)

    if len(image_keys) > 0:
-        schema = pa.Schema.from_pandas(concatenated_df)
-        features = Features.from_arrow_schema(schema)
+        # Handle image features by casting to the appropriate feature types
+        features = concatenated_dataset.features.copy()
        for key in image_keys:
            features[key] = Image()
-        schema = features.arrow_schema
-    else:
-        schema = None
+        concatenated_dataset = concatenated_dataset.cast(features)

-    concatenated_df.to_parquet(path, index=False, schema=schema)
+    concatenated_dataset.to_parquet(path)


 def convert_data(root, new_root):
@@ -67,24 +67,6 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
    save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
    save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")

-    # TODO(rcadene): Enable testing on second and last episode
-    # We currently cant because our test dataset only contains the first episode
-
-    # # save 2 first frames of second episode
-    # i = dataset.meta.episodes["dataset_from_index"][1].item()
-    # save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
-    # save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")
-
-    # # save 2 last frames of second episode
-    # i = dataset.meta.episodes["dataset_to_index"][1].item()
-    # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
-    # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
-
-    # # save 2 last frames of last episode
-    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
-    # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
-    # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
-

 if __name__ == "__main__":
    for dataset in [
@@ -527,24 +527,6 @@ def test_backward_compatibility(repo_id):
    load_and_compare(i - 2)
    load_and_compare(i - 1)

-    # TODO(rcadene): Enable testing on second and last episode
-    # We currently cant because our test dataset only contains the first episode
-
-    # # test 2 first frames of second episode
-    # i = dataset.meta.episodes["dataset_from_index"][1].item()
-    # load_and_compare(i)
-    # load_and_compare(i + 1)
-
-    # # test 2 last frames of second episode
-    # i = dataset.meta.episodes["dataset_to_index"][1].item()
-    # load_and_compare(i - 2)
-    # load_and_compare(i - 1)
-
-    # # test 2 last frames of last episode
-    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
-    # load_and_compare(i - 2)
-    # load_and_compare(i - 1)
-

@pytest.mark.skip("Requires internet access")
 def test_create_branch():
@@ -218,25 +218,6 @@ def stats_factory():
    return _create_stats


-# @pytest.fixture(scope="session")
-# def episodes_stats_factory(stats_factory):
-#     def _create_episodes_stats(
-#         features: dict[str],
-#         total_episodes: int = 3,
-#     ) -> dict:
-
-#         def _generator(total_episodes):
-#             for ep_idx in range(total_episodes):
-#                 flat_ep_stats = flatten_dict(stats_factory(features))
-#                 flat_ep_stats["episode_index"] = ep_idx
-#                 yield flat_ep_stats
-
-#         # Simpler to rely on generator instead of from_dict
-#         return Dataset.from_generator(lambda: _generator(total_episodes))
-
-#     return _create_episodes_stats
-
-
@pytest.fixture(scope="session")
 def tasks_factory():
    def _create_tasks(total_tasks: int = 3) -> pd.DataFrame: