diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py
index 617b3a3e2..d57ca7f93 100644
--- a/src/lerobot/datasets/aggregate.py
+++ b/src/lerobot/datasets/aggregate.py
@@ -152,11 +152,11 @@ def update_meta_data(
 def aggregate_datasets(
     repo_ids: list[str],
     aggr_repo_id: str,
-    roots: list[Path] = None,
-    aggr_root: Path = None,
-    data_files_size_in_mb: float = None,
-    video_files_size_in_mb: float = None,
-    chunk_size: int = None,
+    roots: list[Path] | None = None,
+    aggr_root: Path | None = None,
+    data_files_size_in_mb: float | None = None,
+    video_files_size_in_mb: float | None = None,
+    chunk_size: int | None = None,
 ):
     """Aggregates multiple LeRobot datasets into a single unified dataset.
 
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index eb81fa531..99dbf606e 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -358,6 +358,53 @@ class LeRobotDatasetMetadata:
             video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
             self.info["features"][key]["info"] = get_video_info(video_path)
 
+    def update_chunk_settings(
+        self,
+        chunks_size: int | None = None,
+        data_files_size_in_mb: int | None = None,
+        video_files_size_in_mb: int | None = None,
+    ) -> None:
+        """Update chunk and file size settings after dataset creation.
+
+        This allows users to customize storage organization without modifying the constructor.
+        These settings control how episodes are chunked and how large files can grow before
+        creating new ones.
+
+        Args:
+            chunks_size: Maximum number of files per chunk directory. If None, keeps current value.
+            data_files_size_in_mb: Maximum size for data parquet files in MB. If None, keeps current value.
+            video_files_size_in_mb: Maximum size for video files in MB. If None, keeps current value.
+        """
+        if chunks_size is not None:
+            if chunks_size <= 0:
+                raise ValueError(f"chunks_size must be positive, got {chunks_size}")
+            self.info["chunks_size"] = chunks_size
+
+        if data_files_size_in_mb is not None:
+            if data_files_size_in_mb <= 0:
+                raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
+            self.info["data_files_size_in_mb"] = data_files_size_in_mb
+
+        if video_files_size_in_mb is not None:
+            if video_files_size_in_mb <= 0:
+                raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
+            self.info["video_files_size_in_mb"] = video_files_size_in_mb
+
+        # Update the info file on disk
+        write_info(self.info, self.root)
+
+    def get_chunk_settings(self) -> dict[str, int]:
+        """Get current chunk and file size settings.
+
+        Returns:
+            Dict containing chunks_size, data_files_size_in_mb, and video_files_size_in_mb.
+        """
+        return {
+            "chunks_size": self.chunks_size,
+            "data_files_size_in_mb": self.data_files_size_in_mb,
+            "video_files_size_in_mb": self.video_files_size_in_mb,
+        }
+
     def __repr__(self):
         feature_keys = list(self.features)
         return (
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index 2f6c990e4..605e17a97 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -540,6 +540,9 @@ def create_empty_dataset_info(
     features: dict,
     use_videos: bool,
     robot_type: str | None = None,
+    chunks_size: int | None = None,
+    data_files_size_in_mb: int | None = None,
+    video_files_size_in_mb: int | None = None,
 ) -> dict:
     return {
         "codebase_version": codebase_version,
@@ -547,9 +550,9 @@ def create_empty_dataset_info(
         "total_episodes": 0,
         "total_frames": 0,
         "total_tasks": 0,
-        "chunks_size": DEFAULT_CHUNK_SIZE,
-        "data_files_size_in_mb": DEFAULT_DATA_FILE_SIZE_IN_MB,
-        "video_files_size_in_mb": DEFAULT_VIDEO_FILE_SIZE_IN_MB,
+        "chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
+        "data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
+        "video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
         "fps": fps,
         "splits": {},
         "data_path": DEFAULT_DATA_PATH,
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index eb740c972..ffd033d4b 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -35,6 +35,9 @@ from lerobot.datasets.lerobot_dataset import (
     MultiLeRobotDataset,
 )
 from lerobot.datasets.utils import (
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_DATA_FILE_SIZE_IN_MB,
+    DEFAULT_VIDEO_FILE_SIZE_IN_MB,
     create_branch,
     get_hf_features_from_features,
     hf_transform_to_torch,
@@ -654,3 +657,135 @@ def test_check_cached_episodes_sufficient(tmp_path, lerobot_dataset_factory):
     # Test requesting a mix of available and unavailable episodes
     sparse_dataset.episodes = [0, 1, 2]
     assert sparse_dataset._check_cached_episodes_sufficient() is False
+
+
+def test_update_chunk_settings(tmp_path, empty_lerobot_dataset_factory):
+    """Test the update_chunk_settings functionality for both LeRobotDataset and LeRobotDatasetMetadata."""
+    features = {
+        "observation.state": {
+            "dtype": "float32",
+            "shape": (6,),
+            "names": ["shoulder_pan", "shoulder_lift", "elbow", "wrist_1", "wrist_2", "wrist_3"],
+        },
+        "action": {
+            "dtype": "float32",
+            "shape": (6,),
+            "names": ["shoulder_pan", "shoulder_lift", "elbow", "wrist_1", "wrist_2", "wrist_3"],
+        },
+    }
+
+    # Create dataset with default chunk settings
+    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
+
+    # Test initial default values
+    initial_settings = dataset.meta.get_chunk_settings()
+    assert initial_settings["chunks_size"] == DEFAULT_CHUNK_SIZE
+    assert initial_settings["data_files_size_in_mb"] == DEFAULT_DATA_FILE_SIZE_IN_MB
+    assert initial_settings["video_files_size_in_mb"] == DEFAULT_VIDEO_FILE_SIZE_IN_MB
+
+    # Test updating all settings at once
+    new_chunks_size = 2000
+    new_data_size = 200
+    new_video_size = 1000
+
+    dataset.meta.update_chunk_settings(
+        chunks_size=new_chunks_size,
+        data_files_size_in_mb=new_data_size,
+        video_files_size_in_mb=new_video_size,
+    )
+
+    # Verify settings were updated
+    updated_settings = dataset.meta.get_chunk_settings()
+    assert updated_settings["chunks_size"] == new_chunks_size
+    assert updated_settings["data_files_size_in_mb"] == new_data_size
+    assert updated_settings["video_files_size_in_mb"] == new_video_size
+
+    # Test updating individual settings
+    dataset.meta.update_chunk_settings(chunks_size=1500)
+    settings_after_partial = dataset.meta.get_chunk_settings()
+    assert settings_after_partial["chunks_size"] == 1500
+    assert settings_after_partial["data_files_size_in_mb"] == new_data_size
+    assert settings_after_partial["video_files_size_in_mb"] == new_video_size
+
+    # Test updating only data file size
+    dataset.meta.update_chunk_settings(data_files_size_in_mb=150)
+    settings_after_data = dataset.meta.get_chunk_settings()
+    assert settings_after_data["chunks_size"] == 1500
+    assert settings_after_data["data_files_size_in_mb"] == 150
+    assert settings_after_data["video_files_size_in_mb"] == new_video_size
+
+    # Test updating only video file size
+    dataset.meta.update_chunk_settings(video_files_size_in_mb=800)
+    settings_after_video = dataset.meta.get_chunk_settings()
+    assert settings_after_video["chunks_size"] == 1500
+    assert settings_after_video["data_files_size_in_mb"] == 150
+    assert settings_after_video["video_files_size_in_mb"] == 800
+
+    # Test that settings persist in the info file
+    info_path = dataset.root / "meta" / "info.json"
+    assert info_path.exists()
+
+    # Verify the underlying metadata properties
+    assert dataset.meta.chunks_size == 1500
+    assert dataset.meta.data_files_size_in_mb == 150
+    assert dataset.meta.video_files_size_in_mb == 800
+
+    # Test error handling for invalid values
+    with pytest.raises(ValueError, match="chunks_size must be positive"):
+        dataset.meta.update_chunk_settings(chunks_size=0)
+
+    with pytest.raises(ValueError, match="chunks_size must be positive"):
+        dataset.meta.update_chunk_settings(chunks_size=-100)
+
+    with pytest.raises(ValueError, match="data_files_size_in_mb must be positive"):
+        dataset.meta.update_chunk_settings(data_files_size_in_mb=0)
+
+    with pytest.raises(ValueError, match="data_files_size_in_mb must be positive"):
+        dataset.meta.update_chunk_settings(data_files_size_in_mb=-50)
+
+    with pytest.raises(ValueError, match="video_files_size_in_mb must be positive"):
+        dataset.meta.update_chunk_settings(video_files_size_in_mb=0)
+
+    with pytest.raises(ValueError, match="video_files_size_in_mb must be positive"):
+        dataset.meta.update_chunk_settings(video_files_size_in_mb=-200)
+
+    # Test calling with None values (should not change anything)
+    settings_before_none = dataset.meta.get_chunk_settings()
+    dataset.meta.update_chunk_settings(
+        chunks_size=None, data_files_size_in_mb=None, video_files_size_in_mb=None
+    )
+    settings_after_none = dataset.meta.get_chunk_settings()
+    assert settings_before_none == settings_after_none
+
+    # Test metadata direct access
+    meta_settings = dataset.meta.get_chunk_settings()
+    assert meta_settings == dataset.meta.get_chunk_settings()
+
+    # Test updating via metadata directly
+    dataset.meta.update_chunk_settings(chunks_size=3000)
+    assert dataset.meta.get_chunk_settings()["chunks_size"] == 3000
+
+
+def test_update_chunk_settings_video_dataset(tmp_path):
+    """Test update_chunk_settings with a video dataset to ensure video-specific logic works."""
+    features = {
+        "observation.images.cam": {
+            "dtype": "video",
+            "shape": (480, 640, 3),
+            "names": ["height", "width", "channels"],
+        },
+        "action": {"dtype": "float32", "shape": (6,), "names": ["j1", "j2", "j3", "j4", "j5", "j6"]},
+    }
+
+    # Create video dataset
+    dataset = LeRobotDataset.create(
+        repo_id=DUMMY_REPO_ID, fps=30, features=features, root=tmp_path / "video_test", use_videos=True
+    )
+
+    # Test that video-specific settings work
+    original_video_size = dataset.meta.get_chunk_settings()["video_files_size_in_mb"]
+    new_video_size = original_video_size * 2
+
+    dataset.meta.update_chunk_settings(video_files_size_in_mb=new_video_size)
+    assert dataset.meta.get_chunk_settings()["video_files_size_in_mb"] == new_video_size
+    assert dataset.meta.video_files_size_in_mb == new_video_size