diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index 9e662604e..f7fc9be20 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -12,6 +12,7 @@ LeRobot provides several utilities for manipulating datasets: 4. **Add Features** - Add new features to a dataset 5. **Remove Features** - Remove features from a dataset 6. **Convert to Video** - Convert image-based datasets to video format for efficient storage +7. **Show Dataset Info** - Show a summary of a dataset, such as the number of episodes, the number of frames, and its size on disk The core implementation is in `lerobot.datasets.dataset_tools`. An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`. @@ -156,6 +157,30 @@ lerobot-edit-dataset \ **Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). All episodes, stats, and tasks are preserved. +### Show Dataset Information + +Show information about a dataset, such as the number of episodes, the number of frames, the file size, and so on. 
+No changes are made to the dataset. + +```bash + +# Show dataset information without feature details +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type info + +# Show dataset information with feature details +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type info \ + --operation.show_features true + +``` + +**Parameters:** + +- `show_features`: Whether to include the feature details in the displayed dataset information (default: `false`). + ### Push to Hub Add the `--push_to_hub true` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub: diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index 7c222ac6c..06e256fa2 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -104,6 +104,18 @@ Convert image dataset to video format and push to hub: --operation.type convert_image_to_video \ --push_to_hub true +Show dataset information: + python -m lerobot.scripts.lerobot_edit_dataset \ + --repo_id lerobot/pusht_image \ + --operation.type info \ + --operation.show_features true + +Show dataset information without feature details: + python -m lerobot.scripts.lerobot_edit_dataset \ + --repo_id lerobot/pusht_image \ + --operation.type info \ + --operation.show_features false + Using JSON config file: python -m lerobot.scripts.lerobot_edit_dataset \ --config_path path/to/edit_config.json @@ -112,6 +124,7 @@ Using JSON config file: import abc import logging import shutil +import sys from dataclasses import dataclass from pathlib import Path @@ -184,6 +197,13 @@ class ConvertImageToVideoConfig(OperationConfig): max_frames_per_batch: int | None = None +@OperationConfig.register_subclass("info") +@dataclass +class InfoConfig(OperationConfig): + type: str = "info" + show_features: bool = False + + @dataclass class EditDatasetConfig: repo_id: str @@ -436,6 +456,49 @@ def 
handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: logging.info("Dataset saved locally (not pushed to hub)") +def _get_dataset_size(repo_path): + import os + + total = 0 + with os.scandir(repo_path) as it: + for entry in it: + if entry.is_file(): + total += entry.stat().st_size + elif entry.is_dir(): + total += _get_dataset_size(entry.path) + return total + + +def handle_info(cfg: EditDatasetConfig): + if not isinstance(cfg.operation, InfoConfig): + raise ValueError("Operation config must be InfoConfig") + + dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) + sys.stdout.write(f"======Info {dataset.meta.repo_id}\n") + sys.stdout.write(f"Repository ID: {dataset.meta.repo_id} \n") + sys.stdout.write(f"Total episode: {dataset.meta.total_episodes} \n") + sys.stdout.write(f"Total task: {dataset.meta.total_tasks} \n") + sys.stdout.write(f"Total frame(Actual Count): {dataset.meta.total_frames}({len(dataset)}) \n") + sys.stdout.write( + f"Average frame per episode: {dataset.meta.total_frames / dataset.meta.total_episodes:.1f}\n" + ) + sys.stdout.write( + f"Average episode time(sec): {(dataset.meta.total_frames / dataset.meta.total_episodes) / dataset.meta.fps:.1f}\n" + ) + sys.stdout.write(f"FPS: {dataset.meta.fps}\n") + + total_file_size = _get_dataset_size(dataset.root) + sys.stdout.write(f"Size: {total_file_size / (1024 * 1024):.1f} MB\n") + if cfg.operation.show_features: + import json + + feature_dump_str = json.dumps( + dataset.meta.features, ensure_ascii=False, indent=4, sort_keys=True, separators=(",", ": ") + ) + sys.stdout.write("Features:\n") + sys.stdout.write(f"{feature_dump_str}\n") + + @parser.wrap() def edit_dataset(cfg: EditDatasetConfig) -> None: operation_type = cfg.operation.type @@ -452,6 +515,8 @@ def edit_dataset(cfg: EditDatasetConfig) -> None: handle_modify_tasks(cfg) elif operation_type == "convert_image_to_video": handle_convert_image_to_video(cfg) + elif operation_type == "info": + handle_info(cfg) else: available = ", 
".join(OperationConfig.get_known_choices()) raise ValueError(f"Unknown operation: {operation_type}\nAvailable operations: {available}") diff --git a/tests/scripts/test_edit_dataset_parsing.py b/tests/scripts/test_edit_dataset_parsing.py index bf7386b52..8800b92ee 100644 --- a/tests/scripts/test_edit_dataset_parsing.py +++ b/tests/scripts/test_edit_dataset_parsing.py @@ -21,6 +21,7 @@ from lerobot.scripts.lerobot_edit_dataset import ( ConvertImageToVideoConfig, DeleteEpisodesConfig, EditDatasetConfig, + InfoConfig, MergeConfig, ModifyTasksConfig, OperationConfig, @@ -46,6 +47,7 @@ class TestOperationTypeParsing: ("remove_feature", RemoveFeatureConfig), ("modify_tasks", ModifyTasksConfig), ("convert_image_to_video", ConvertImageToVideoConfig), + ("info", InfoConfig), ], ) def test_operation_type_resolves_correct_class(self, type_name, expected_cls): @@ -63,6 +65,7 @@ class TestOperationTypeParsing: ("remove_feature", RemoveFeatureConfig), ("modify_tasks", ModifyTasksConfig), ("convert_image_to_video", ConvertImageToVideoConfig), + ("info", InfoConfig), ], ) def test_get_choice_name_roundtrips(self, type_name, expected_cls):