mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 19:19:56 +00:00
feat(convert_dataset_v21_to_v3): use the more efficient Dataset.from_parquet and concatenate_datasets
This commit is contained in:
@@ -22,7 +22,7 @@ This script will help you convert any LeRobot dataset already pushed to the hub
|
|||||||
- Check consistency between these new stats and the old ones.
|
- Check consistency between these new stats and the old ones.
|
||||||
- Remove the deprecated `stats.json`.
|
- Remove the deprecated `stats.json`.
|
||||||
- Update codebase_version in `info.json`.
|
- Update codebase_version in `info.json`.
|
||||||
- Push this new version to the hub on the 'main' branch and tag it with "v2.1".
|
- Push this new version to the hub on the 'main' branch and tag it with "v3.0".
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
@@ -40,9 +40,8 @@ from typing import Any
|
|||||||
|
|
||||||
import jsonlines
|
import jsonlines
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pyarrow as pa
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from datasets import Dataset, Features, Image
|
from datasets import Dataset, Image, concatenate_datasets
|
||||||
from huggingface_hub import HfApi, snapshot_download
|
from huggingface_hub import HfApi, snapshot_download
|
||||||
from requests import HTTPError
|
from requests import HTTPError
|
||||||
|
|
||||||
@@ -153,24 +152,21 @@ def convert_tasks(root, new_root):
|
|||||||
|
|
||||||
|
|
||||||
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
|
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
|
||||||
# TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
|
# Save RAM by using Dataset.from_parquet and concatenate_datasets
|
||||||
dataframes = [pd.read_parquet(file) for file in paths_to_cat]
|
datasets = [Dataset.from_parquet(file) for file in paths_to_cat]
|
||||||
# Concatenate all DataFrames along rows
|
concatenated_dataset = concatenate_datasets(datasets)
|
||||||
concatenated_df = pd.concat(dataframes, ignore_index=True)
|
|
||||||
|
|
||||||
path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
|
path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
if len(image_keys) > 0:
|
if len(image_keys) > 0:
|
||||||
schema = pa.Schema.from_pandas(concatenated_df)
|
# Handle image features by casting to the appropriate feature types
|
||||||
features = Features.from_arrow_schema(schema)
|
features = concatenated_dataset.features.copy()
|
||||||
for key in image_keys:
|
for key in image_keys:
|
||||||
features[key] = Image()
|
features[key] = Image()
|
||||||
schema = features.arrow_schema
|
concatenated_dataset = concatenated_dataset.cast(features)
|
||||||
else:
|
|
||||||
schema = None
|
|
||||||
|
|
||||||
concatenated_df.to_parquet(path, index=False, schema=schema)
|
concatenated_dataset.to_parquet(path)
|
||||||
|
|
||||||
|
|
||||||
def convert_data(root, new_root):
|
def convert_data(root, new_root):
|
||||||
|
|||||||
@@ -67,24 +67,6 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
|
|||||||
save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
|
save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
|
||||||
save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")
|
save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")
|
||||||
|
|
||||||
# TODO(rcadene): Enable testing on second and last episode
|
|
||||||
# We currently can't because our test dataset only contains the first episode
|
|
||||||
|
|
||||||
# # save 2 first frames of second episode
|
|
||||||
# i = dataset.meta.episodes["dataset_from_index"][1].item()
|
|
||||||
# save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
|
|
||||||
# save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")
|
|
||||||
|
|
||||||
# # save 2 last frames of second episode
|
|
||||||
# i = dataset.meta.episodes["dataset_to_index"][1].item()
|
|
||||||
# save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
|
|
||||||
# save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
|
|
||||||
|
|
||||||
# # save 2 last frames of last episode
|
|
||||||
# i = dataset.meta.episodes["dataset_to_index"][-1].item()
|
|
||||||
# save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
|
|
||||||
# save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
for dataset in [
|
for dataset in [
|
||||||
|
|||||||
@@ -527,24 +527,6 @@ def test_backward_compatibility(repo_id):
|
|||||||
load_and_compare(i - 2)
|
load_and_compare(i - 2)
|
||||||
load_and_compare(i - 1)
|
load_and_compare(i - 1)
|
||||||
|
|
||||||
# TODO(rcadene): Enable testing on second and last episode
|
|
||||||
# We currently can't because our test dataset only contains the first episode
|
|
||||||
|
|
||||||
# # test 2 first frames of second episode
|
|
||||||
# i = dataset.meta.episodes["dataset_from_index"][1].item()
|
|
||||||
# load_and_compare(i)
|
|
||||||
# load_and_compare(i + 1)
|
|
||||||
|
|
||||||
# # test 2 last frames of second episode
|
|
||||||
# i = dataset.meta.episodes["dataset_to_index"][1].item()
|
|
||||||
# load_and_compare(i - 2)
|
|
||||||
# load_and_compare(i - 1)
|
|
||||||
|
|
||||||
# # test 2 last frames of last episode
|
|
||||||
# i = dataset.meta.episodes["dataset_to_index"][-1].item()
|
|
||||||
# load_and_compare(i - 2)
|
|
||||||
# load_and_compare(i - 1)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("Requires internet access")
|
@pytest.mark.skip("Requires internet access")
|
||||||
def test_create_branch():
|
def test_create_branch():
|
||||||
|
|||||||
Vendored
-19
@@ -218,25 +218,6 @@ def stats_factory():
|
|||||||
return _create_stats
|
return _create_stats
|
||||||
|
|
||||||
|
|
||||||
# @pytest.fixture(scope="session")
|
|
||||||
# def episodes_stats_factory(stats_factory):
|
|
||||||
# def _create_episodes_stats(
|
|
||||||
# features: dict[str],
|
|
||||||
# total_episodes: int = 3,
|
|
||||||
# ) -> dict:
|
|
||||||
|
|
||||||
# def _generator(total_episodes):
|
|
||||||
# for ep_idx in range(total_episodes):
|
|
||||||
# flat_ep_stats = flatten_dict(stats_factory(features))
|
|
||||||
# flat_ep_stats["episode_index"] = ep_idx
|
|
||||||
# yield flat_ep_stats
|
|
||||||
|
|
||||||
# # Simpler to rely on generator instead of from_dict
|
|
||||||
# return Dataset.from_generator(lambda: _generator(total_episodes))
|
|
||||||
|
|
||||||
# return _create_episodes_stats
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def tasks_factory():
|
def tasks_factory():
|
||||||
def _create_tasks(total_tasks: int = 3) -> pd.DataFrame:
|
def _create_tasks(total_tasks: int = 3) -> pd.DataFrame:
|
||||||
|
|||||||
Reference in New Issue
Block a user