diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py
index d5ceb69c7..cd248262c 100644
--- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py
+++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py
@@ -22,7 +22,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub
 - Check consistency between these new stats and the old ones.
 - Remove the deprecated `stats.json`.
 - Update codebase_version in `info.json`.
-- Push this new version to the hub on the 'main' branch and tags it with "v2.1".
+- Push this new version to the hub on the 'main' branch and tag it with "v3.0".
 
 Usage:
 
@@ -40,9 +40,8 @@ from typing import Any
 
 import jsonlines
 import pandas as pd
-import pyarrow as pa
 import tqdm
-from datasets import Dataset, Features, Image
+from datasets import Dataset, Image, concatenate_datasets
 from huggingface_hub import HfApi, snapshot_download
 from requests import HTTPError
 
@@ -153,24 +152,21 @@ def convert_tasks(root, new_root):
 
 
 def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
-    # TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
-    dataframes = [pd.read_parquet(file) for file in paths_to_cat]
-    # Concatenate all DataFrames along rows
-    concatenated_df = pd.concat(dataframes, ignore_index=True)
+    # Save RAM by using Dataset.from_parquet and concatenate_datasets
+    datasets = [Dataset.from_parquet(file) for file in paths_to_cat]
+    concatenated_dataset = concatenate_datasets(datasets)
 
     path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
     path.parent.mkdir(parents=True, exist_ok=True)
 
     if len(image_keys) > 0:
-        schema = pa.Schema.from_pandas(concatenated_df)
-        features = Features.from_arrow_schema(schema)
+        # Handle image features by casting to the appropriate feature types
+        features = concatenated_dataset.features.copy()
         for key in image_keys:
             features[key] = Image()
-        schema = features.arrow_schema
-    else:
-        schema = None
+        concatenated_dataset = concatenated_dataset.cast(features)
 
-    concatenated_df.to_parquet(path, index=False, schema=schema)
+    concatenated_dataset.to_parquet(path)
 
 
 def convert_data(root, new_root):
diff --git a/tests/artifacts/datasets/save_dataset_to_safetensors.py b/tests/artifacts/datasets/save_dataset_to_safetensors.py
index 9b5af152c..f8f3ff7ae 100644
--- a/tests/artifacts/datasets/save_dataset_to_safetensors.py
+++ b/tests/artifacts/datasets/save_dataset_to_safetensors.py
@@ -67,24 +67,6 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
     save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
     save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")
 
-    # TODO(rcadene): Enable testing on second and last episode
-    # We currently cant because our test dataset only contains the first episode
-
-    # # save 2 first frames of second episode
-    # i = dataset.meta.episodes["dataset_from_index"][1].item()
-    # save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
-    # save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")
-
-    # # save 2 last frames of second episode
-    # i = dataset.meta.episodes["dataset_to_index"][1].item()
-    # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
-    # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
-
-    # # save 2 last frames of last episode
-    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
-    # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
-    # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
-
 
 if __name__ == "__main__":
     for dataset in [
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index 33c92e344..c798c6a2a 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -527,24 +527,6 @@ def test_backward_compatibility(repo_id):
     load_and_compare(i - 2)
     load_and_compare(i - 1)
 
-    # TODO(rcadene): Enable testing on second and last episode
-    # We currently cant because our test dataset only contains the first episode
-
-    # # test 2 first frames of second episode
-    # i = dataset.meta.episodes["dataset_from_index"][1].item()
-    # load_and_compare(i)
-    # load_and_compare(i + 1)
-
-    # # test 2 last frames of second episode
-    # i = dataset.meta.episodes["dataset_to_index"][1].item()
-    # load_and_compare(i - 2)
-    # load_and_compare(i - 1)
-
-    # # test 2 last frames of last episode
-    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
-    # load_and_compare(i - 2)
-    # load_and_compare(i - 1)
-
 
 @pytest.mark.skip("Requires internet access")
 def test_create_branch():
diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py
index 3f93b3d65..6ea9a8105 100644
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -218,25 +218,6 @@ def stats_factory():
     return _create_stats
 
 
-# @pytest.fixture(scope="session")
-# def episodes_stats_factory(stats_factory):
-#     def _create_episodes_stats(
-#         features: dict[str],
-#         total_episodes: int = 3,
-#     ) -> dict:
-
-#         def _generator(total_episodes):
-#             for ep_idx in range(total_episodes):
-#                 flat_ep_stats = flatten_dict(stats_factory(features))
-#                 flat_ep_stats["episode_index"] = ep_idx
-#                 yield flat_ep_stats
-
-#         # Simpler to rely on generator instead of from_dict
-#         return Dataset.from_generator(lambda: _generator(total_episodes))
-
-#     return _create_episodes_stats
-
-
 @pytest.fixture(scope="session")
 def tasks_factory():
     def _create_tasks(total_tasks: int = 3) -> pd.DataFrame: