diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index b62d7d959..c900d7479 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -89,8 +89,8 @@ def delete_episodes( Args: dataset: The source LeRobotDataset. episode_indices: List of episode indices to delete. - output_dir: Directory to save the new dataset. If None, uses default location. - repo_id: Repository ID for the new dataset. If None, appends "_modified" to original. + output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. + repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. """ if not episode_indices: raise ValueError("No episodes to delete") @@ -152,7 +152,7 @@ def split_dataset( dataset: The source LeRobotDataset to split. splits: Either a dict mapping split names to episode indices, or a dict mapping split names to fractions (must sum to <= 1.0). - output_dir: Base directory for output datasets. If None, uses default location. + output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Examples: Split by specific episodes @@ -243,8 +243,8 @@ def merge_datasets( Args: datasets: List of LeRobotDatasets to merge. - output_repo_id: Repository ID for the merged dataset. - output_dir: Directory to save the merged dataset. If None, uses default location. + output_repo_id: Merged dataset identifier. + output_dir: Root directory where the merged dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/output_repo_id. """ if not datasets: raise ValueError("No datasets to merge") @@ -288,8 +288,8 @@ def modify_features( dataset: The source LeRobotDataset. add_features: Optional dict mapping feature names to (feature_values, feature_info) tuples. remove_features: Optional feature name(s) to remove. Can be a single string or list. - output_dir: Directory to save the new dataset. If None, uses default location. - repo_id: Repository ID for the new dataset. If None, appends "_modified" to original. + output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. + repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. Returns: New dataset with features modified. @@ -390,8 +390,8 @@ def add_features( Args: dataset: The source LeRobotDataset. features: Dictionary mapping feature names to (feature_values, feature_info) tuples. - output_dir: Directory to save the new dataset. If None, uses default location. - repo_id: Repository ID for the new dataset. If None, appends "_modified" to original. + output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. + repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. Returns: New dataset with all features added. @@ -427,8 +427,8 @@ def remove_feature( Args: dataset: The source LeRobotDataset. feature_names: Name(s) of features to remove. Can be a single string or list. - output_dir: Directory to save the new dataset. If None, uses default location. - repo_id: Repository ID for the new dataset. If None, appends "_modified" to original. + output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. + repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. Returns: New dataset with features removed. @@ -1529,7 +1529,7 @@ def modify_tasks( def convert_image_to_video_dataset( dataset: LeRobotDataset, - output_dir: Path, + output_dir: Path | None = None, repo_id: str | None = None, vcodec: str = "libsvtav1", pix_fmt: str = "yuv420p", @@ -1548,8 +1548,8 @@ def convert_image_to_video_dataset( Args: dataset: The source LeRobot dataset with images - output_dir: Directory to save the new video dataset - repo_id: Repository ID for the new dataset (default: original_id + "_video") + output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. + repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. vcodec: Video codec (default: libsvtav1) pix_fmt: Pixel format (default: yuv420p) g: Group of pictures size (default: 2) @@ -1600,6 +1600,7 @@ def convert_image_to_video_dataset( # Video info will be updated after episodes are encoded # Create new metadata for video dataset + output_dir = Path(output_dir) if output_dir is not None else HF_LEROBOT_HOME / repo_id new_meta = LeRobotDatasetMetadata.create( repo_id=repo_id, fps=dataset.meta.fps, diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index afdc95efd..49825317d 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -21,6 +21,9 @@ This script allows you to delete episodes, split datasets, merge datasets, remove features, modify tasks, and convert image datasets to video format. When new_repo_id is specified, creates a new dataset. +Path semantics (v2): --root and --new_root are exact dataset folders containing +meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}. + Usage Examples: Delete episodes 0, 2, and 5 from a dataset: @@ -29,16 +32,31 @@ Delete episodes 0, 2, and 5 from a dataset: --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" -Delete episodes and save to a new dataset: +Delete episodes from a local dataset at a specific path: lerobot-edit-dataset \ --repo_id lerobot/pusht \ - --new_repo_id lerobot/pusht_filtered \ + --root /path/to/pusht \ --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" -Split dataset by fractions: +Delete episodes and save to a new dataset at a specific path and with a new repo_id: lerobot-edit-dataset \ --repo_id lerobot/pusht \ + --new_repo_id lerobot/pusht_filtered \ + --new_root /path/to/pusht_filtered \ + --operation.type delete_episodes \ + --operation.episode_indices "[0, 2, 5]" + +Split dataset by fractions (pusht_train, pusht_val): + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --operation.type split \ + --operation.splits '{"train": 0.8, "val": 0.2}' + +Split dataset by fractions and save split datasets to a specific folder (base_folder/train, base_folder/val): + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --new_root /path/to/base_folder \ --operation.type split \ --operation.splits '{"train": 0.8, "val": 0.2}' @@ -56,15 +74,29 @@ Split into more than two splits: Merge multiple datasets: lerobot-edit-dataset \ - --repo_id lerobot/pusht_merged \ + --new_repo_id lerobot/pusht_merged \ --operation.type merge \ --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" +Merge multiple datasets to a specific output path: + lerobot-edit-dataset \ + --new_repo_id lerobot/pusht_merged \ + --new_root /path/to/pusht_merged \ + --operation.type merge \ + --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" + +Merge multiple datasets from a list of local dataset paths: + lerobot-edit-dataset \ + --new_repo_id lerobot/pusht_merged \ + --operation.type merge \ + --operation.repo_ids "['pusht_train', 'pusht_val']" \ + --operation.roots "['/path/to/pusht_train', '/path/to/pusht_val']" + Remove camera feature: lerobot-edit-dataset \ --repo_id lerobot/pusht \ --operation.type remove_feature \ - --operation.feature_names "['observation.images.top']" + --operation.feature_names "['observation.image']" Modify tasks - set a single task for all episodes (WARNING: modifies in-place): lerobot-edit-dataset \ @@ -88,8 +120,8 @@ Modify tasks - set default task with overrides for specific episodes (WARNING: m Convert image dataset to video format and save locally: lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ - --operation.type convert_image_to_video \ - --operation.output_dir /path/to/output/pusht_video + --new_root /path/to/output/pusht_video \ + --operation.type convert_image_to_video Convert image dataset to video format and save with new repo_id: lerobot-edit-dataset \ @@ -167,6 +199,7 @@ class SplitConfig(OperationConfig): @dataclass class MergeConfig(OperationConfig): repo_ids: list[str] | None = None + roots: list[str] | None = None @OperationConfig.register_subclass("remove_feature") @@ -200,36 +233,46 @@ class ConvertImageToVideoConfig(OperationConfig): @OperationConfig.register_subclass("info") @dataclass class InfoConfig(OperationConfig): - type: str = "info" show_features: bool = False @dataclass class EditDatasetConfig: - repo_id: str + # Operation configuration. operation: OperationConfig + # Input dataset identifier. Always required unless for Merge operation. + repo_id: str | None = None + # Root directory where the input dataset is stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. root: str | None = None + # Edited dataset identifier. When both new_repo_id (resp. new_root) and repo_id (resp. root) are identical, modifications are applied in-place and a backup of the original dataset is created. Required for Merge operation. new_repo_id: str | None = None + # Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/new_repo_id. For Split operation, this is the base directory for the split datasets. + new_root: str | None = None + # Upload dataset to Hugging Face hub. push_to_hub: bool = False -def get_output_path(repo_id: str, new_repo_id: str | None, root: Path | None) -> tuple[str, Path]: - if new_repo_id: - output_repo_id = new_repo_id - output_dir = root / new_repo_id if root else HF_LEROBOT_HOME / new_repo_id - else: - output_repo_id = repo_id - dataset_path = root / repo_id if root else HF_LEROBOT_HOME / repo_id - old_path = Path(str(dataset_path) + "_old") +def get_output_path( + repo_id: str, + new_repo_id: str | None, + root: Path | str | None, + new_root: Path | str | None, +) -> tuple[str, Path]: + input_path = Path(root) if root else HF_LEROBOT_HOME / repo_id - if dataset_path.exists(): - if old_path.exists(): - shutil.rmtree(old_path) - shutil.move(str(dataset_path), str(old_path)) + output_repo_id = new_repo_id if new_repo_id else repo_id + output_path = Path(new_root) if new_root else HF_LEROBOT_HOME / output_repo_id - output_dir = dataset_path + # In case of in-place modification, create a backup of the original dataset (if it exists) + if output_path == input_path: + backup_path = input_path.with_name(input_path.name + "_old") - return output_repo_id, output_dir + if input_path.exists(): + if backup_path.exists(): + shutil.rmtree(backup_path) + shutil.move(input_path, backup_path) + + return output_repo_id, output_path def handle_delete_episodes(cfg: EditDatasetConfig) -> None: @@ -241,11 +284,15 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) output_repo_id, output_dir = get_output_path( - cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None + cfg.repo_id, + new_repo_id=cfg.new_repo_id, + root=cfg.root, + new_root=cfg.new_root, ) - if cfg.new_repo_id is None: - dataset.root = Path(str(dataset.root) + "_old") + # In case of in-place modification, make the dataset point to the backup directory + if output_dir == dataset.root: + dataset.root = dataset.root.with_name(dataset.root.name + "_old") logging.info(f"Deleting episodes {cfg.operation.episode_indices} from {cfg.repo_id}") new_dataset = delete_episodes( @@ -272,19 +319,27 @@ def handle_split(cfg: EditDatasetConfig) -> None: "splits dict must be specified with split names as keys and fractions/episode lists as values" ) + if cfg.new_repo_id is not None: + logging.warning( + "split uses the original dataset identifier --repo_id to generate split names. The --new_repo_id parameter is ignored." + ) + dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) logging.info(f"Splitting dataset {cfg.repo_id} with splits: {cfg.operation.splits}") - split_datasets = split_dataset(dataset, splits=cfg.operation.splits) + split_datasets = split_dataset( + dataset, + splits=cfg.operation.splits, + output_dir=cfg.new_root, + ) for split_name, split_ds in split_datasets.items(): - split_repo_id = f"{cfg.repo_id}_{split_name}" logging.info( f"{split_name}: {split_ds.meta.total_episodes} episodes, {split_ds.meta.total_frames} frames" ) if cfg.push_to_hub: - logging.info(f"Pushing {split_name} split to hub as {split_repo_id}") + logging.info(f"Pushing {split_name} split to hub as {split_ds.repo_id}") LeRobotDataset(split_ds.repo_id, root=split_ds.root).push_to_hub() @@ -295,18 +350,29 @@ def handle_merge(cfg: EditDatasetConfig) -> None: if not cfg.operation.repo_ids: raise ValueError("repo_ids must be specified for merge operation") - if not cfg.repo_id: - raise ValueError("repo_id must be specified as the output repository for merged dataset") + if cfg.repo_id is not None or cfg.root is not None: + logging.warning( + "merge uses --new_repo_id and --new_root for the merged dataset. The --repo_id and --root parameters are ignored." + ) - logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge") - datasets = [LeRobotDataset(repo_id, root=cfg.root) for repo_id in cfg.operation.repo_ids] + if cfg.operation.roots: + if len(cfg.operation.roots) != len(cfg.operation.repo_ids): + raise ValueError("repo_ids and roots must have the same length for merge operation") + logging.info(f"Loading {len(cfg.operation.roots)} datasets to merge") + datasets = [ + LeRobotDataset(repo_id=repo_id, root=root) + for repo_id, root in zip(cfg.operation.repo_ids, cfg.operation.roots, strict=True) + ] + else: + logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge") + datasets = [LeRobotDataset(repo_id) for repo_id in cfg.operation.repo_ids] - output_dir = Path(cfg.root) / cfg.repo_id if cfg.root else HF_LEROBOT_HOME / cfg.repo_id + output_dir = Path(cfg.new_root) if cfg.new_root else HF_LEROBOT_HOME / cfg.new_repo_id - logging.info(f"Merging datasets into {cfg.repo_id}") + logging.info(f"Merging datasets into {cfg.new_repo_id}") merged_dataset = merge_datasets( datasets, - output_repo_id=cfg.repo_id, + output_repo_id=cfg.new_repo_id, output_dir=output_dir, ) @@ -316,7 +382,7 @@ def handle_merge(cfg: EditDatasetConfig) -> None: ) if cfg.push_to_hub: - logging.info(f"Pushing to hub as {cfg.repo_id}") + logging.info(f"Pushing to hub as {cfg.new_repo_id}") LeRobotDataset(merged_dataset.repo_id, root=output_dir).push_to_hub() @@ -329,11 +395,15 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) output_repo_id, output_dir = get_output_path( - cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None + cfg.repo_id, + new_repo_id=cfg.new_repo_id, + root=cfg.root, + new_root=cfg.new_root, ) - if cfg.new_repo_id is None: - dataset.root = Path(str(dataset.root) + "_old") + # In case of in-place modification, make the dataset point to the backup directory + if output_dir == dataset.root: + dataset.root = dataset.root.with_name(dataset.root.name + "_old") logging.info(f"Removing features {cfg.operation.feature_names} from {cfg.repo_id}") new_dataset = remove_feature( @@ -361,9 +431,10 @@ def handle_modify_tasks(cfg: EditDatasetConfig) -> None: if new_task is None and episode_tasks_raw is None: raise ValueError("Must specify at least one of new_task or episode_tasks for modify_tasks operation") - # Warn about in-place modification behavior - if cfg.new_repo_id is not None: - logging.warning("modify_tasks modifies datasets in-place. The --new_repo_id parameter is ignored.") + if cfg.new_repo_id is not None or cfg.new_root is not None: + logging.warning( + "modify_tasks modifies datasets in-place. The --new_repo_id and --new_root parameters are ignored." + ) dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) logging.warning(f"Modifying dataset in-place at {dataset.root}. Original data will be overwritten.") @@ -399,32 +470,30 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) # Determine output directory and repo_id - # Priority: 1) new_repo_id, 2) operation.output_dir, 3) auto-generated name + # Priority: 1) new_root, 2) new_repo_id, 3) operation.output_dir, 4) auto-generated name output_dir_config = getattr(cfg.operation, "output_dir", None) + if output_dir_config: + logging.warning( + "--operation.output_dir is deprecated and will be removed in future versions. " + "Please use --new_root instead." + ) - if cfg.new_repo_id: - # Use new_repo_id for both local storage and hub push + if cfg.new_root: + output_dir = Path(cfg.new_root) + output_repo_id = cfg.new_repo_id or f"{cfg.repo_id}_video" + logging.info(f"Saving to new_root: {output_dir} as {output_repo_id}") + elif cfg.new_repo_id: output_repo_id = cfg.new_repo_id - # Place new dataset as a sibling to the original dataset - # Get the parent of the actual dataset root (not cfg.root which might be the lerobot cache dir) - # Extract just the dataset name (after last slash) for the local directory - local_dir_name = cfg.new_repo_id.split("/")[-1] - output_dir = dataset.root.parent / local_dir_name + output_dir = HF_LEROBOT_HOME / cfg.new_repo_id logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}") elif output_dir_config: - # Use custom output directory for local-only storage output_dir = Path(output_dir_config) - # Extract repo name from output_dir for the dataset output_repo_id = output_dir.name - logging.info(f"Saving to local directory: {output_dir}") + logging.info(f"Saving to local directory: {output_dir} as {output_repo_id}") else: - # Auto-generate name: append "_video" to original repo_id output_repo_id = f"{cfg.repo_id}_video" - # Place new dataset as a sibling to the original dataset - # Extract just the dataset name (after last slash) for the local directory - local_dir_name = output_repo_id.split("/")[-1] - output_dir = dataset.root.parent / local_dir_name - logging.info(f"Saving to auto-generated location: {output_dir}") + output_dir = HF_LEROBOT_HOME / output_repo_id + logging.info(f"Saving to auto-generated location: {output_dir} as {output_repo_id}") logging.info(f"Converting dataset {cfg.repo_id} to video format") @@ -499,8 +568,20 @@ def handle_info(cfg: EditDatasetConfig): sys.stdout.write(f"{feature_dump_str}\n") +def _validate_config(cfg: EditDatasetConfig) -> None: + if isinstance(cfg.operation, MergeConfig): + if not cfg.new_repo_id: + raise ValueError("--new_repo_id is required for merge operation (the merged dataset identifier)") + else: + if not cfg.repo_id: + raise ValueError( + f"--repo_id is required for {cfg.operation.type} operation (the input dataset identifier)" + ) + + @parser.wrap() def edit_dataset(cfg: EditDatasetConfig) -> None: + _validate_config(cfg) operation_type = cfg.operation.type if operation_type == "delete_episodes": diff --git a/tests/scripts/test_edit_dataset_parsing.py b/tests/scripts/test_edit_dataset_parsing.py index 8800b92ee..4d758ae35 100644 --- a/tests/scripts/test_edit_dataset_parsing.py +++ b/tests/scripts/test_edit_dataset_parsing.py @@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import ( OperationConfig, RemoveFeatureConfig, SplitConfig, + _validate_config, ) @@ -51,11 +52,23 @@ class TestOperationTypeParsing: ], ) def test_operation_type_resolves_correct_class(self, type_name, expected_cls): - cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", type_name]) + cfg = parse_cfg( + ["--repo_id", "test/repo", "--new_repo_id", "test/merged", "--operation.type", type_name] + ) assert isinstance(cfg.operation, expected_cls), ( f"Expected {expected_cls.__name__}, got {type(cfg.operation).__name__}" ) + def test_merge_requires_new_repo_id(self): + cfg = parse_cfg(["--operation.type", "merge"]) + with pytest.raises(ValueError, match="--new_repo_id is required for merge"): + _validate_config(cfg) + + def test_non_merge_requires_repo_id(self): + cfg = parse_cfg(["--operation.type", "delete_episodes"]) + with pytest.raises(ValueError, match="--repo_id is required for delete_episodes"): + _validate_config(cfg) + @pytest.mark.parametrize( "type_name, expected_cls", [ @@ -69,6 +82,8 @@ class TestOperationTypeParsing: ], ) def test_get_choice_name_roundtrips(self, type_name, expected_cls): - cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", type_name]) + cfg = parse_cfg( + ["--repo_id", "test/repo", "--new_repo_id", "test/merged", "--operation.type", type_name] + ) resolved_name = OperationConfig.get_choice_name(type(cfg.operation)) assert resolved_name == type_name