mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 11:39:50 +00:00
add update_chunk_settings method for LeRobotDatasetMetadata. Introduce tests for chunk settings updates and validation of parameters.
This commit is contained in:
@@ -152,11 +152,11 @@ def update_meta_data(
|
|||||||
def aggregate_datasets(
|
def aggregate_datasets(
|
||||||
repo_ids: list[str],
|
repo_ids: list[str],
|
||||||
aggr_repo_id: str,
|
aggr_repo_id: str,
|
||||||
roots: list[Path] = None,
|
roots: list[Path] | None = None,
|
||||||
aggr_root: Path = None,
|
aggr_root: Path | None = None,
|
||||||
data_files_size_in_mb: float = None,
|
data_files_size_in_mb: float | None = None,
|
||||||
video_files_size_in_mb: float = None,
|
video_files_size_in_mb: float | None = None,
|
||||||
chunk_size: int = None,
|
chunk_size: int | None = None,
|
||||||
):
|
):
|
||||||
"""Aggregates multiple LeRobot datasets into a single unified dataset.
|
"""Aggregates multiple LeRobot datasets into a single unified dataset.
|
||||||
|
|
||||||
|
|||||||
@@ -358,6 +358,53 @@ class LeRobotDatasetMetadata:
|
|||||||
video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
|
video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
|
||||||
self.info["features"][key]["info"] = get_video_info(video_path)
|
self.info["features"][key]["info"] = get_video_info(video_path)
|
||||||
|
|
||||||
|
def update_chunk_settings(
|
||||||
|
self,
|
||||||
|
chunks_size: int | None = None,
|
||||||
|
data_files_size_in_mb: int | None = None,
|
||||||
|
video_files_size_in_mb: int | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Update chunk and file size settings after dataset creation.
|
||||||
|
|
||||||
|
This allows users to customize storage organization without modifying the constructor.
|
||||||
|
These settings control how episodes are chunked and how large files can grow before
|
||||||
|
creating new ones.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunks_size: Maximum number of files per chunk directory. If None, keeps current value.
|
||||||
|
data_files_size_in_mb: Maximum size for data parquet files in MB. If None, keeps current value.
|
||||||
|
video_files_size_in_mb: Maximum size for video files in MB. If None, keeps current value.
|
||||||
|
"""
|
||||||
|
if chunks_size is not None:
|
||||||
|
if chunks_size <= 0:
|
||||||
|
raise ValueError(f"chunks_size must be positive, got {chunks_size}")
|
||||||
|
self.info["chunks_size"] = chunks_size
|
||||||
|
|
||||||
|
if data_files_size_in_mb is not None:
|
||||||
|
if data_files_size_in_mb <= 0:
|
||||||
|
raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
|
||||||
|
self.info["data_files_size_in_mb"] = data_files_size_in_mb
|
||||||
|
|
||||||
|
if video_files_size_in_mb is not None:
|
||||||
|
if video_files_size_in_mb <= 0:
|
||||||
|
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
|
||||||
|
self.info["video_files_size_in_mb"] = video_files_size_in_mb
|
||||||
|
|
||||||
|
# Update the info file on disk
|
||||||
|
write_info(self.info, self.root)
|
||||||
|
|
||||||
|
def get_chunk_settings(self) -> dict[str, int]:
|
||||||
|
"""Get current chunk and file size settings.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict containing chunks_size, data_files_size_in_mb, and video_files_size_in_mb.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"chunks_size": self.chunks_size,
|
||||||
|
"data_files_size_in_mb": self.data_files_size_in_mb,
|
||||||
|
"video_files_size_in_mb": self.video_files_size_in_mb,
|
||||||
|
}
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
feature_keys = list(self.features)
|
feature_keys = list(self.features)
|
||||||
return (
|
return (
|
||||||
|
|||||||
@@ -540,6 +540,9 @@ def create_empty_dataset_info(
|
|||||||
features: dict,
|
features: dict,
|
||||||
use_videos: bool,
|
use_videos: bool,
|
||||||
robot_type: str | None = None,
|
robot_type: str | None = None,
|
||||||
|
chunks_size: int | None = None,
|
||||||
|
data_files_size_in_mb: int | None = None,
|
||||||
|
video_files_size_in_mb: int | None = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
return {
|
return {
|
||||||
"codebase_version": codebase_version,
|
"codebase_version": codebase_version,
|
||||||
@@ -547,9 +550,9 @@ def create_empty_dataset_info(
|
|||||||
"total_episodes": 0,
|
"total_episodes": 0,
|
||||||
"total_frames": 0,
|
"total_frames": 0,
|
||||||
"total_tasks": 0,
|
"total_tasks": 0,
|
||||||
"chunks_size": DEFAULT_CHUNK_SIZE,
|
"chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
|
||||||
"data_files_size_in_mb": DEFAULT_DATA_FILE_SIZE_IN_MB,
|
"data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||||
"video_files_size_in_mb": DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
"video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||||
"fps": fps,
|
"fps": fps,
|
||||||
"splits": {},
|
"splits": {},
|
||||||
"data_path": DEFAULT_DATA_PATH,
|
"data_path": DEFAULT_DATA_PATH,
|
||||||
|
|||||||
@@ -35,6 +35,9 @@ from lerobot.datasets.lerobot_dataset import (
|
|||||||
MultiLeRobotDataset,
|
MultiLeRobotDataset,
|
||||||
)
|
)
|
||||||
from lerobot.datasets.utils import (
|
from lerobot.datasets.utils import (
|
||||||
|
DEFAULT_CHUNK_SIZE,
|
||||||
|
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||||
|
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||||
create_branch,
|
create_branch,
|
||||||
get_hf_features_from_features,
|
get_hf_features_from_features,
|
||||||
hf_transform_to_torch,
|
hf_transform_to_torch,
|
||||||
@@ -654,3 +657,135 @@ def test_check_cached_episodes_sufficient(tmp_path, lerobot_dataset_factory):
|
|||||||
# Test requesting a mix of available and unavailable episodes
|
# Test requesting a mix of available and unavailable episodes
|
||||||
sparse_dataset.episodes = [0, 1, 2]
|
sparse_dataset.episodes = [0, 1, 2]
|
||||||
assert sparse_dataset._check_cached_episodes_sufficient() is False
|
assert sparse_dataset._check_cached_episodes_sufficient() is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_chunk_settings(tmp_path, empty_lerobot_dataset_factory):
|
||||||
|
"""Test the update_chunk_settings functionality for both LeRobotDataset and LeRobotDatasetMetadata."""
|
||||||
|
features = {
|
||||||
|
"observation.state": {
|
||||||
|
"dtype": "float32",
|
||||||
|
"shape": (6,),
|
||||||
|
"names": ["shoulder_pan", "shoulder_lift", "elbow", "wrist_1", "wrist_2", "wrist_3"],
|
||||||
|
},
|
||||||
|
"action": {
|
||||||
|
"dtype": "float32",
|
||||||
|
"shape": (6,),
|
||||||
|
"names": ["shoulder_pan", "shoulder_lift", "elbow", "wrist_1", "wrist_2", "wrist_3"],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create dataset with default chunk settings
|
||||||
|
dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
|
||||||
|
|
||||||
|
# Test initial default values
|
||||||
|
initial_settings = dataset.meta.get_chunk_settings()
|
||||||
|
assert initial_settings["chunks_size"] == DEFAULT_CHUNK_SIZE
|
||||||
|
assert initial_settings["data_files_size_in_mb"] == DEFAULT_DATA_FILE_SIZE_IN_MB
|
||||||
|
assert initial_settings["video_files_size_in_mb"] == DEFAULT_VIDEO_FILE_SIZE_IN_MB
|
||||||
|
|
||||||
|
# Test updating all settings at once
|
||||||
|
new_chunks_size = 2000
|
||||||
|
new_data_size = 200
|
||||||
|
new_video_size = 1000
|
||||||
|
|
||||||
|
dataset.meta.update_chunk_settings(
|
||||||
|
chunks_size=new_chunks_size,
|
||||||
|
data_files_size_in_mb=new_data_size,
|
||||||
|
video_files_size_in_mb=new_video_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify settings were updated
|
||||||
|
updated_settings = dataset.meta.get_chunk_settings()
|
||||||
|
assert updated_settings["chunks_size"] == new_chunks_size
|
||||||
|
assert updated_settings["data_files_size_in_mb"] == new_data_size
|
||||||
|
assert updated_settings["video_files_size_in_mb"] == new_video_size
|
||||||
|
|
||||||
|
# Test updating individual settings
|
||||||
|
dataset.meta.update_chunk_settings(chunks_size=1500)
|
||||||
|
settings_after_partial = dataset.meta.get_chunk_settings()
|
||||||
|
assert settings_after_partial["chunks_size"] == 1500
|
||||||
|
assert settings_after_partial["data_files_size_in_mb"] == new_data_size
|
||||||
|
assert settings_after_partial["video_files_size_in_mb"] == new_video_size
|
||||||
|
|
||||||
|
# Test updating only data file size
|
||||||
|
dataset.meta.update_chunk_settings(data_files_size_in_mb=150)
|
||||||
|
settings_after_data = dataset.meta.get_chunk_settings()
|
||||||
|
assert settings_after_data["chunks_size"] == 1500
|
||||||
|
assert settings_after_data["data_files_size_in_mb"] == 150
|
||||||
|
assert settings_after_data["video_files_size_in_mb"] == new_video_size
|
||||||
|
|
||||||
|
# Test updating only video file size
|
||||||
|
dataset.meta.update_chunk_settings(video_files_size_in_mb=800)
|
||||||
|
settings_after_video = dataset.meta.get_chunk_settings()
|
||||||
|
assert settings_after_video["chunks_size"] == 1500
|
||||||
|
assert settings_after_video["data_files_size_in_mb"] == 150
|
||||||
|
assert settings_after_video["video_files_size_in_mb"] == 800
|
||||||
|
|
||||||
|
# Test that settings persist in the info file
|
||||||
|
info_path = dataset.root / "meta" / "info.json"
|
||||||
|
assert info_path.exists()
|
||||||
|
|
||||||
|
# Verify the underlying metadata properties
|
||||||
|
assert dataset.meta.chunks_size == 1500
|
||||||
|
assert dataset.meta.data_files_size_in_mb == 150
|
||||||
|
assert dataset.meta.video_files_size_in_mb == 800
|
||||||
|
|
||||||
|
# Test error handling for invalid values
|
||||||
|
with pytest.raises(ValueError, match="chunks_size must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(chunks_size=0)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="chunks_size must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(chunks_size=-100)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="data_files_size_in_mb must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(data_files_size_in_mb=0)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="data_files_size_in_mb must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(data_files_size_in_mb=-50)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="video_files_size_in_mb must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(video_files_size_in_mb=0)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="video_files_size_in_mb must be positive"):
|
||||||
|
dataset.meta.update_chunk_settings(video_files_size_in_mb=-200)
|
||||||
|
|
||||||
|
# Test calling with None values (should not change anything)
|
||||||
|
settings_before_none = dataset.meta.get_chunk_settings()
|
||||||
|
dataset.meta.update_chunk_settings(
|
||||||
|
chunks_size=None, data_files_size_in_mb=None, video_files_size_in_mb=None
|
||||||
|
)
|
||||||
|
settings_after_none = dataset.meta.get_chunk_settings()
|
||||||
|
assert settings_before_none == settings_after_none
|
||||||
|
|
||||||
|
# Test metadata direct access
|
||||||
|
meta_settings = dataset.meta.get_chunk_settings()
|
||||||
|
assert meta_settings == dataset.meta.get_chunk_settings()
|
||||||
|
|
||||||
|
# Test updating via metadata directly
|
||||||
|
dataset.meta.update_chunk_settings(chunks_size=3000)
|
||||||
|
assert dataset.meta.get_chunk_settings()["chunks_size"] == 3000
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_chunk_settings_video_dataset(tmp_path):
|
||||||
|
"""Test update_chunk_settings with a video dataset to ensure video-specific logic works."""
|
||||||
|
features = {
|
||||||
|
"observation.images.cam": {
|
||||||
|
"dtype": "video",
|
||||||
|
"shape": (480, 640, 3),
|
||||||
|
"names": ["height", "width", "channels"],
|
||||||
|
},
|
||||||
|
"action": {"dtype": "float32", "shape": (6,), "names": ["j1", "j2", "j3", "j4", "j5", "j6"]},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create video dataset
|
||||||
|
dataset = LeRobotDataset.create(
|
||||||
|
repo_id=DUMMY_REPO_ID, fps=30, features=features, root=tmp_path / "video_test", use_videos=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test that video-specific settings work
|
||||||
|
original_video_size = dataset.meta.get_chunk_settings()["video_files_size_in_mb"]
|
||||||
|
new_video_size = original_video_size * 2
|
||||||
|
|
||||||
|
dataset.meta.update_chunk_settings(video_files_size_in_mb=new_video_size)
|
||||||
|
assert dataset.meta.get_chunk_settings()["video_files_size_in_mb"] == new_video_size
|
||||||
|
assert dataset.meta.video_files_size_in_mb == new_video_size
|
||||||
|
|||||||
Reference in New Issue
Block a user