Add an `update_chunk_settings` method to `LeRobotDatasetMetadata`. Introduce tests for chunk-settings updates and for parameter validation.

This commit is contained in:
Michel Aractingi
2025-08-18 00:00:23 +02:00
parent c7a3b02625
commit db36f01e8b
4 changed files with 193 additions and 8 deletions
+5 -5
View File
@@ -152,11 +152,11 @@ def update_meta_data(
def aggregate_datasets( def aggregate_datasets(
repo_ids: list[str], repo_ids: list[str],
aggr_repo_id: str, aggr_repo_id: str,
roots: list[Path] = None, roots: list[Path] | None = None,
aggr_root: Path = None, aggr_root: Path | None = None,
data_files_size_in_mb: float = None, data_files_size_in_mb: float | None = None,
video_files_size_in_mb: float = None, video_files_size_in_mb: float | None = None,
chunk_size: int = None, chunk_size: int | None = None,
): ):
"""Aggregates multiple LeRobot datasets into a single unified dataset. """Aggregates multiple LeRobot datasets into a single unified dataset.
+47
View File
@@ -358,6 +358,53 @@ class LeRobotDatasetMetadata:
video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key) video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
self.info["features"][key]["info"] = get_video_info(video_path) self.info["features"][key]["info"] = get_video_info(video_path)
def update_chunk_settings(
self,
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
) -> None:
"""Update chunk and file size settings after dataset creation.
This allows users to customize storage organization without modifying the constructor.
These settings control how episodes are chunked and how large files can grow before
creating new ones.
Args:
chunks_size: Maximum number of files per chunk directory. If None, keeps current value.
data_files_size_in_mb: Maximum size for data parquet files in MB. If None, keeps current value.
video_files_size_in_mb: Maximum size for video files in MB. If None, keeps current value.
"""
if chunks_size is not None:
if chunks_size <= 0:
raise ValueError(f"chunks_size must be positive, got {chunks_size}")
self.info["chunks_size"] = chunks_size
if data_files_size_in_mb is not None:
if data_files_size_in_mb <= 0:
raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
self.info["data_files_size_in_mb"] = data_files_size_in_mb
if video_files_size_in_mb is not None:
if video_files_size_in_mb <= 0:
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
self.info["video_files_size_in_mb"] = video_files_size_in_mb
# Update the info file on disk
write_info(self.info, self.root)
def get_chunk_settings(self) -> dict[str, int]:
    """Return the current chunk and file size settings.

    Returns:
        A dictionary with the keys ``chunks_size``, ``data_files_size_in_mb`` and
        ``video_files_size_in_mb``, each read from the attribute of the same name.
    """
    setting_names = ("chunks_size", "data_files_size_in_mb", "video_files_size_in_mb")
    return {name: getattr(self, name) for name in setting_names}
def __repr__(self): def __repr__(self):
feature_keys = list(self.features) feature_keys = list(self.features)
return ( return (
+6 -3
View File
@@ -540,6 +540,9 @@ def create_empty_dataset_info(
features: dict, features: dict,
use_videos: bool, use_videos: bool,
robot_type: str | None = None, robot_type: str | None = None,
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
) -> dict: ) -> dict:
return { return {
"codebase_version": codebase_version, "codebase_version": codebase_version,
@@ -547,9 +550,9 @@ def create_empty_dataset_info(
"total_episodes": 0, "total_episodes": 0,
"total_frames": 0, "total_frames": 0,
"total_tasks": 0, "total_tasks": 0,
"chunks_size": DEFAULT_CHUNK_SIZE, "chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
"data_files_size_in_mb": DEFAULT_DATA_FILE_SIZE_IN_MB, "data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
"video_files_size_in_mb": DEFAULT_VIDEO_FILE_SIZE_IN_MB, "video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
"fps": fps, "fps": fps,
"splits": {}, "splits": {},
"data_path": DEFAULT_DATA_PATH, "data_path": DEFAULT_DATA_PATH,
+135
View File
@@ -35,6 +35,9 @@ from lerobot.datasets.lerobot_dataset import (
MultiLeRobotDataset, MultiLeRobotDataset,
) )
from lerobot.datasets.utils import ( from lerobot.datasets.utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
create_branch, create_branch,
get_hf_features_from_features, get_hf_features_from_features,
hf_transform_to_torch, hf_transform_to_torch,
@@ -654,3 +657,135 @@ def test_check_cached_episodes_sufficient(tmp_path, lerobot_dataset_factory):
# Test requesting a mix of available and unavailable episodes # Test requesting a mix of available and unavailable episodes
sparse_dataset.episodes = [0, 1, 2] sparse_dataset.episodes = [0, 1, 2]
assert sparse_dataset._check_cached_episodes_sufficient() is False assert sparse_dataset._check_cached_episodes_sufficient() is False
def test_update_chunk_settings(tmp_path, empty_lerobot_dataset_factory):
    """Test update_chunk_settings / get_chunk_settings on the metadata of an empty dataset.

    Covers the default values, updating all settings at once, updating each setting
    individually, persistence of the values in ``meta/info.json``, rejection of
    non-positive values, and the no-op behaviour of passing ``None``.
    """
    # Local import so this test does not depend on the module's import block.
    import json

    joint_names = ["shoulder_pan", "shoulder_lift", "elbow", "wrist_1", "wrist_2", "wrist_3"]
    features = {
        "observation.state": {
            "dtype": "float32",
            "shape": (6,),
            "names": list(joint_names),
        },
        "action": {
            "dtype": "float32",
            "shape": (6,),
            "names": list(joint_names),
        },
    }

    # Create dataset with default chunk settings
    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)

    # Test initial default values
    initial_settings = dataset.meta.get_chunk_settings()
    assert initial_settings["chunks_size"] == DEFAULT_CHUNK_SIZE
    assert initial_settings["data_files_size_in_mb"] == DEFAULT_DATA_FILE_SIZE_IN_MB
    assert initial_settings["video_files_size_in_mb"] == DEFAULT_VIDEO_FILE_SIZE_IN_MB

    # Test updating all settings at once
    new_chunks_size = 2000
    new_data_size = 200
    new_video_size = 1000
    dataset.meta.update_chunk_settings(
        chunks_size=new_chunks_size,
        data_files_size_in_mb=new_data_size,
        video_files_size_in_mb=new_video_size,
    )

    # Verify settings were updated
    updated_settings = dataset.meta.get_chunk_settings()
    assert updated_settings["chunks_size"] == new_chunks_size
    assert updated_settings["data_files_size_in_mb"] == new_data_size
    assert updated_settings["video_files_size_in_mb"] == new_video_size

    # Test updating individual settings
    dataset.meta.update_chunk_settings(chunks_size=1500)
    settings_after_partial = dataset.meta.get_chunk_settings()
    assert settings_after_partial["chunks_size"] == 1500
    assert settings_after_partial["data_files_size_in_mb"] == new_data_size
    assert settings_after_partial["video_files_size_in_mb"] == new_video_size

    # Test updating only data file size
    dataset.meta.update_chunk_settings(data_files_size_in_mb=150)
    settings_after_data = dataset.meta.get_chunk_settings()
    assert settings_after_data["chunks_size"] == 1500
    assert settings_after_data["data_files_size_in_mb"] == 150
    assert settings_after_data["video_files_size_in_mb"] == new_video_size

    # Test updating only video file size
    dataset.meta.update_chunk_settings(video_files_size_in_mb=800)
    settings_after_video = dataset.meta.get_chunk_settings()
    assert settings_after_video["chunks_size"] == 1500
    assert settings_after_video["data_files_size_in_mb"] == 150
    assert settings_after_video["video_files_size_in_mb"] == 800

    # Test that settings persist in the info file: check the stored values,
    # not merely that the file exists.
    info_path = dataset.root / "meta" / "info.json"
    assert info_path.exists()
    persisted_info = json.loads(info_path.read_text(encoding="utf-8"))
    assert persisted_info["chunks_size"] == 1500
    assert persisted_info["data_files_size_in_mb"] == 150
    assert persisted_info["video_files_size_in_mb"] == 800

    # Verify the underlying metadata properties
    assert dataset.meta.chunks_size == 1500
    assert dataset.meta.data_files_size_in_mb == 150
    assert dataset.meta.video_files_size_in_mb == 800

    # Test error handling for invalid values (zero and negative for each setting)
    invalid_cases = [
        ("chunks_size", 0),
        ("chunks_size", -100),
        ("data_files_size_in_mb", 0),
        ("data_files_size_in_mb", -50),
        ("video_files_size_in_mb", 0),
        ("video_files_size_in_mb", -200),
    ]
    for setting_name, bad_value in invalid_cases:
        with pytest.raises(ValueError, match=f"{setting_name} must be positive"):
            dataset.meta.update_chunk_settings(**{setting_name: bad_value})

    # Test calling with None values (should not change anything)
    settings_before_none = dataset.meta.get_chunk_settings()
    dataset.meta.update_chunk_settings(
        chunks_size=None, data_files_size_in_mb=None, video_files_size_in_mb=None
    )
    settings_after_none = dataset.meta.get_chunk_settings()
    assert settings_before_none == settings_after_none

    # Test one more update after the rejected and no-op calls
    dataset.meta.update_chunk_settings(chunks_size=3000)
    assert dataset.meta.get_chunk_settings()["chunks_size"] == 3000
def test_update_chunk_settings_video_dataset(tmp_path):
    """Check that the video file size setting can be updated on a dataset created with a video feature."""
    camera_feature = {
        "dtype": "video",
        "shape": (480, 640, 3),
        "names": ["height", "width", "channels"],
    }
    action_feature = {
        "dtype": "float32",
        "shape": (6,),
        "names": ["j1", "j2", "j3", "j4", "j5", "j6"],
    }
    features = {
        "observation.images.cam": camera_feature,
        "action": action_feature,
    }

    # Create video dataset
    dataset = LeRobotDataset.create(
        repo_id=DUMMY_REPO_ID,
        fps=30,
        features=features,
        root=tmp_path / "video_test",
        use_videos=True,
    )

    # Double the current video size limit and confirm that both the settings
    # dictionary and the metadata attribute report the new value.
    doubled_video_size = dataset.meta.get_chunk_settings()["video_files_size_in_mb"] * 2
    dataset.meta.update_chunk_settings(video_files_size_in_mb=doubled_video_size)

    assert dataset.meta.get_chunk_settings()["video_files_size_in_mb"] == doubled_video_size
    assert dataset.meta.video_files_size_in_mb == doubled_video_size