Add variable-size test datasets (#1610)

* fix: dummy datasets can be written to multiple files in multiple folders based on arbitrary data size

* fix: writing atomic episodes to multiple files (maybe)

* fix: moving unused write dataset function to test code
This commit is contained in:
Francesco Capuano
2025-07-30 11:26:28 +02:00
committed by GitHub
parent 890b1e473d
commit 527ae8e557
4 changed files with 114 additions and 14 deletions
+6 -2
View File
@@ -19,6 +19,8 @@ import pytest
from huggingface_hub.utils import filter_repo_objects
from lerobot.datasets.utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_DATA_PATH,
DEFAULT_EPISODES_PATH,
DEFAULT_TASKS_PATH,
@@ -54,9 +56,11 @@ def mock_snapshot_download_factory(
tasks: pd.DataFrame | None = None,
episodes: datasets.Dataset | None = None,
hf_dataset: datasets.Dataset | None = None,
data_files_size_in_mb: float = DEFAULT_DATA_FILE_SIZE_IN_MB,
chunks_size: int = DEFAULT_CHUNK_SIZE,
):
if info is None:
info = info_factory()
info = info_factory(data_files_size_in_mb=data_files_size_in_mb, chunks_size=chunks_size)
if stats is None:
stats = stats_factory(features=info["features"])
if tasks is None:
@@ -132,7 +136,7 @@ def mock_snapshot_download_factory(
if request_episodes:
create_episodes(local_dir, episodes)
if request_data:
create_hf_dataset(local_dir, hf_dataset)
create_hf_dataset(local_dir, hf_dataset, data_files_size_in_mb, chunks_size)
if request_videos:
create_videos(root=local_dir, info=info)