feat(convert_dataset_v21_to_v3): use the more memory-efficient Dataset.from_parquet and concatenate_datasets instead of pandas read_parquet/concat

This commit is contained in:
Michel Aractingi
2025-07-22 17:27:41 +02:00
parent 670d7f485f
commit 218ebed3ef
4 changed files with 9 additions and 68 deletions
@@ -22,7 +22,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub
- Check consistency between these new stats and the old ones. - Check consistency between these new stats and the old ones.
- Remove the deprecated `stats.json`. - Remove the deprecated `stats.json`.
- Update codebase_version in `info.json`. - Update codebase_version in `info.json`.
- Push this new version to the hub on the 'main' branch and tag it with "v2.1". - Push this new version to the hub on the 'main' branch and tag it with "v3.0".
Usage: Usage:
@@ -40,9 +40,8 @@ from typing import Any
import jsonlines import jsonlines
import pandas as pd import pandas as pd
import pyarrow as pa
import tqdm import tqdm
from datasets import Dataset, Features, Image from datasets import Dataset, Image, concatenate_datasets
from huggingface_hub import HfApi, snapshot_download from huggingface_hub import HfApi, snapshot_download
from requests import HTTPError from requests import HTTPError
@@ -153,24 +152,21 @@ def convert_tasks(root, new_root):
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys): def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
# TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets # Save RAM by using Dataset.from_parquet and concatenate_datasets
dataframes = [pd.read_parquet(file) for file in paths_to_cat] datasets = [Dataset.from_parquet(file) for file in paths_to_cat]
# Concatenate all DataFrames along rows concatenated_dataset = concatenate_datasets(datasets)
concatenated_df = pd.concat(dataframes, ignore_index=True)
path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx) path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
if len(image_keys) > 0: if len(image_keys) > 0:
schema = pa.Schema.from_pandas(concatenated_df) # Handle image features by casting to the appropriate feature types
features = Features.from_arrow_schema(schema) features = concatenated_dataset.features.copy()
for key in image_keys: for key in image_keys:
features[key] = Image() features[key] = Image()
schema = features.arrow_schema concatenated_dataset = concatenated_dataset.cast(features)
else:
schema = None
concatenated_df.to_parquet(path, index=False, schema=schema) concatenated_dataset.to_parquet(path)
def convert_data(root, new_root): def convert_data(root, new_root):
@@ -67,24 +67,6 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors") save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors") save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")
# TODO(rcadene): Enable testing on second and last episode
# We currently can't because our test dataset only contains the first episode
# # save 2 first frames of second episode
# i = dataset.meta.episodes["dataset_from_index"][1].item()
# save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
# save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")
# # save 2 last frames of second episode
# i = dataset.meta.episodes["dataset_to_index"][1].item()
# save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
# save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
# # save 2 last frames of last episode
# i = dataset.meta.episodes["dataset_to_index"][-1].item()
# save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
# save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
if __name__ == "__main__": if __name__ == "__main__":
for dataset in [ for dataset in [
-18
View File
@@ -527,24 +527,6 @@ def test_backward_compatibility(repo_id):
load_and_compare(i - 2) load_and_compare(i - 2)
load_and_compare(i - 1) load_and_compare(i - 1)
# TODO(rcadene): Enable testing on second and last episode
# We currently can't because our test dataset only contains the first episode
# # test 2 first frames of second episode
# i = dataset.meta.episodes["dataset_from_index"][1].item()
# load_and_compare(i)
# load_and_compare(i + 1)
# # test 2 last frames of second episode
# i = dataset.meta.episodes["dataset_to_index"][1].item()
# load_and_compare(i - 2)
# load_and_compare(i - 1)
# # test 2 last frames of last episode
# i = dataset.meta.episodes["dataset_to_index"][-1].item()
# load_and_compare(i - 2)
# load_and_compare(i - 1)
@pytest.mark.skip("Requires internet access") @pytest.mark.skip("Requires internet access")
def test_create_branch(): def test_create_branch():
-19
View File
@@ -218,25 +218,6 @@ def stats_factory():
return _create_stats return _create_stats
# @pytest.fixture(scope="session")
# def episodes_stats_factory(stats_factory):
# def _create_episodes_stats(
# features: dict[str],
# total_episodes: int = 3,
# ) -> dict:
# def _generator(total_episodes):
# for ep_idx in range(total_episodes):
# flat_ep_stats = flatten_dict(stats_factory(features))
# flat_ep_stats["episode_index"] = ep_idx
# yield flat_ep_stats
# # Simpler to rely on generator instead of from_dict
# return Dataset.from_generator(lambda: _generator(total_episodes))
# return _create_episodes_stats
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def tasks_factory(): def tasks_factory():
def _create_tasks(total_tasks: int = 3) -> pd.DataFrame: def _create_tasks(total_tasks: int = 3) -> pd.DataFrame: