mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-24 13:09:43 +00:00
feat(datasets): Add info operation to lerobot-edit-dataset command (#2917)
* Add a new feature to lerobot_edit_dataset.py that shows dataset information. * Fix the draccus error that occurs when only --operation.type=info is given. * Update the tests and documentation for the lerobot-edit-dataset info function. * Update the documentation for the lerobot-edit-dataset extract function: the option name in the documentation was wrong. * feat(datasets): Update to align formatting with pre-commit. (#2917) Update to align formatting with pre-commit. --------- Co-authored-by: Caroline Pascal <caroline8.pascal@gmail.com>
This commit is contained in:
@@ -12,6 +12,7 @@ LeRobot provides several utilities for manipulating datasets:
|
|||||||
4. **Add Features** - Add new features to a dataset
|
4. **Add Features** - Add new features to a dataset
|
||||||
5. **Remove Features** - Remove features from a dataset
|
5. **Remove Features** - Remove features from a dataset
|
||||||
6. **Convert to Video** - Convert image-based datasets to video format for efficient storage
|
6. **Convert to Video** - Convert image-based datasets to video format for efficient storage
|
||||||
|
7. **Show the Info of Datasets** - Show a summary of dataset information, such as the number of episodes.
|
||||||
|
|
||||||
The core implementation is in `lerobot.datasets.dataset_tools`.
|
The core implementation is in `lerobot.datasets.dataset_tools`.
|
||||||
An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`.
|
An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`.
|
||||||
@@ -156,6 +157,30 @@ lerobot-edit-dataset \
|
|||||||
|
|
||||||
**Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). All episodes, stats, and tasks are preserved.
|
**Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). All episodes, stats, and tasks are preserved.
|
||||||
|
|
||||||
|
### Show the information of datasets
|
||||||
|
|
||||||
|
Show dataset information such as the number of episodes, the number of frames, the file size, and so on.
|
||||||
|
No changes are made to the dataset.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
|
||||||
|
# Show dataset information without feature details
|
||||||
|
lerobot-edit-dataset \
|
||||||
|
--repo_id lerobot/pusht_image \
|
||||||
|
--operation.type info
|
||||||
|
|
||||||
|
# Show dataset information with feature details
|
||||||
|
lerobot-edit-dataset \
|
||||||
|
--repo_id lerobot/pusht_image \
|
||||||
|
--operation.type info \
|
||||||
|
--operation.show_features true
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
|
||||||
|
- `show_features`: Whether to also print the dataset's feature details (default: `false`).
|
||||||
|
|
||||||
### Push to Hub
|
### Push to Hub
|
||||||
|
|
||||||
Add the `--push_to_hub true` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub:
|
Add the `--push_to_hub true` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub:
|
||||||
|
|||||||
@@ -104,6 +104,18 @@ Convert image dataset to video format and push to hub:
|
|||||||
--operation.type convert_image_to_video \
|
--operation.type convert_image_to_video \
|
||||||
--push_to_hub true
|
--push_to_hub true
|
||||||
|
|
||||||
|
Show dataset information with feature details:
|
||||||
|
python -m lerobot.scripts.lerobot_edit_dataset \
|
||||||
|
--repo_id lerobot/pusht_image \
|
||||||
|
--operation.type info \
|
||||||
|
--operation.show_features true
|
||||||
|
|
||||||
|
Show dataset information without feature details:
|
||||||
|
python -m lerobot.scripts.lerobot_edit_dataset \
|
||||||
|
--repo_id lerobot/pusht_image \
|
||||||
|
--operation.type info \
|
||||||
|
--operation.show_features false
|
||||||
|
|
||||||
Using JSON config file:
|
Using JSON config file:
|
||||||
python -m lerobot.scripts.lerobot_edit_dataset \
|
python -m lerobot.scripts.lerobot_edit_dataset \
|
||||||
--config_path path/to/edit_config.json
|
--config_path path/to/edit_config.json
|
||||||
@@ -112,6 +124,7 @@ Using JSON config file:
|
|||||||
import abc
|
import abc
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
|
import sys
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -184,6 +197,13 @@ class ConvertImageToVideoConfig(OperationConfig):
|
|||||||
max_frames_per_batch: int | None = None
|
max_frames_per_batch: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@OperationConfig.register_subclass("info")
@dataclass
class InfoConfig(OperationConfig):
    """Configuration for the ``info`` operation, which prints a summary of a dataset."""

    # Choice name of this operation; matches the name passed to ``register_subclass`` above.
    type: str = "info"
    # When True, the info handler additionally prints the dataset's feature definitions as JSON.
    show_features: bool = False
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class EditDatasetConfig:
|
class EditDatasetConfig:
|
||||||
repo_id: str
|
repo_id: str
|
||||||
@@ -436,6 +456,49 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
|
|||||||
logging.info("Dataset saved locally (not pushed to hub)")
|
logging.info("Dataset saved locally (not pushed to hub)")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_dataset_size(repo_path):
|
||||||
|
import os
|
||||||
|
|
||||||
|
total = 0
|
||||||
|
with os.scandir(repo_path) as it:
|
||||||
|
for entry in it:
|
||||||
|
if entry.is_file():
|
||||||
|
total += entry.stat().st_size
|
||||||
|
elif entry.is_dir():
|
||||||
|
total += _get_dataset_size(entry.path)
|
||||||
|
return total
|
||||||
|
|
||||||
|
|
||||||
|
def handle_info(cfg: EditDatasetConfig) -> None:
    """Print a summary of the dataset selected by ``cfg`` to standard output.

    The summary contains the repository id, the episode, task and frame counts,
    the average episode length (in frames and in seconds), the FPS and the
    on-disk size of the dataset root. When ``cfg.operation.show_features`` is
    true, the feature definitions are printed as pretty-printed JSON as well.

    Args:
        cfg: Edit configuration whose ``operation`` must be an ``InfoConfig``.

    Raises:
        ValueError: If ``cfg.operation`` is not an ``InfoConfig``.
    """
    if not isinstance(cfg.operation, InfoConfig):
        raise ValueError("Operation config must be InfoConfig")

    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
    meta = dataset.meta

    # Guard the averages so that metadata reporting zero episodes or a zero FPS
    # prints 0.0 instead of aborting the whole report with ZeroDivisionError.
    frames_per_episode = meta.total_frames / meta.total_episodes if meta.total_episodes else 0.0
    seconds_per_episode = frames_per_episode / meta.fps if meta.fps else 0.0

    sys.stdout.write(f"======Info {meta.repo_id}\n")
    sys.stdout.write(f"Repository ID: {meta.repo_id} \n")
    sys.stdout.write(f"Total episode: {meta.total_episodes} \n")
    sys.stdout.write(f"Total task: {meta.total_tasks} \n")
    # The metadata count is shown next to the number of frames actually loaded.
    sys.stdout.write(f"Total frame(Actual Count): {meta.total_frames}({len(dataset)}) \n")
    sys.stdout.write(f"Average frame per episode: {frames_per_episode:.1f}\n")
    sys.stdout.write(f"Average episode time(sec): {seconds_per_episode:.1f}\n")
    sys.stdout.write(f"FPS: {meta.fps}\n")

    total_file_size = _get_dataset_size(dataset.root)
    sys.stdout.write(f"Size: {total_file_size / (1024 * 1024):.1f} MB\n")

    if cfg.operation.show_features:
        import json

        feature_dump_str = json.dumps(
            meta.features, ensure_ascii=False, indent=4, sort_keys=True, separators=(",", ": ")
        )
        sys.stdout.write("Features:\n")
        sys.stdout.write(f"{feature_dump_str}\n")
|
||||||
|
|
||||||
|
|
||||||
@parser.wrap()
|
@parser.wrap()
|
||||||
def edit_dataset(cfg: EditDatasetConfig) -> None:
|
def edit_dataset(cfg: EditDatasetConfig) -> None:
|
||||||
operation_type = cfg.operation.type
|
operation_type = cfg.operation.type
|
||||||
@@ -452,6 +515,8 @@ def edit_dataset(cfg: EditDatasetConfig) -> None:
|
|||||||
handle_modify_tasks(cfg)
|
handle_modify_tasks(cfg)
|
||||||
elif operation_type == "convert_image_to_video":
|
elif operation_type == "convert_image_to_video":
|
||||||
handle_convert_image_to_video(cfg)
|
handle_convert_image_to_video(cfg)
|
||||||
|
elif operation_type == "info":
|
||||||
|
handle_info(cfg)
|
||||||
else:
|
else:
|
||||||
available = ", ".join(OperationConfig.get_known_choices())
|
available = ", ".join(OperationConfig.get_known_choices())
|
||||||
raise ValueError(f"Unknown operation: {operation_type}\nAvailable operations: {available}")
|
raise ValueError(f"Unknown operation: {operation_type}\nAvailable operations: {available}")
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from lerobot.scripts.lerobot_edit_dataset import (
|
|||||||
ConvertImageToVideoConfig,
|
ConvertImageToVideoConfig,
|
||||||
DeleteEpisodesConfig,
|
DeleteEpisodesConfig,
|
||||||
EditDatasetConfig,
|
EditDatasetConfig,
|
||||||
|
InfoConfig,
|
||||||
MergeConfig,
|
MergeConfig,
|
||||||
ModifyTasksConfig,
|
ModifyTasksConfig,
|
||||||
OperationConfig,
|
OperationConfig,
|
||||||
@@ -46,6 +47,7 @@ class TestOperationTypeParsing:
|
|||||||
("remove_feature", RemoveFeatureConfig),
|
("remove_feature", RemoveFeatureConfig),
|
||||||
("modify_tasks", ModifyTasksConfig),
|
("modify_tasks", ModifyTasksConfig),
|
||||||
("convert_image_to_video", ConvertImageToVideoConfig),
|
("convert_image_to_video", ConvertImageToVideoConfig),
|
||||||
|
("info", InfoConfig),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_operation_type_resolves_correct_class(self, type_name, expected_cls):
|
def test_operation_type_resolves_correct_class(self, type_name, expected_cls):
|
||||||
@@ -63,6 +65,7 @@ class TestOperationTypeParsing:
|
|||||||
("remove_feature", RemoveFeatureConfig),
|
("remove_feature", RemoveFeatureConfig),
|
||||||
("modify_tasks", ModifyTasksConfig),
|
("modify_tasks", ModifyTasksConfig),
|
||||||
("convert_image_to_video", ConvertImageToVideoConfig),
|
("convert_image_to_video", ConvertImageToVideoConfig),
|
||||||
|
("info", InfoConfig),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_get_choice_name_roundtrips(self, type_name, expected_cls):
|
def test_get_choice_name_roundtrips(self, type_name, expected_cls):
|
||||||
|
|||||||
Reference in New Issue
Block a user