diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index afdc95efd..56cf2a330 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -19,7 +19,9 @@ Edit LeRobot datasets using various transformation tools. This script allows you to delete episodes, split datasets, merge datasets, remove features, modify tasks, and convert image datasets to video format. -When new_repo_id is specified, creates a new dataset. + +Path semantics (v2): --root and --new_root are exact dataset folders containing +meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}. Usage Examples: @@ -29,10 +31,18 @@ Delete episodes 0, 2, and 5 from a dataset: --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" -Delete episodes and save to a new dataset: +Delete episodes from a local dataset at a specific path: + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --root /data/lerobot/pusht \ + --operation.type delete_episodes \ + --operation.episode_indices "[0, 2, 5]" + +Delete episodes and save to a new dataset at a specific path: lerobot-edit-dataset \ --repo_id lerobot/pusht \ --new_repo_id lerobot/pusht_filtered \ + --new_root /data/lerobot/pusht_filtered \ --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" @@ -42,6 +52,13 @@ Split dataset by fractions: --operation.type split \ --operation.splits '{"train": 0.8, "val": 0.2}' +Split dataset by fractions with custom output base directory: + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --new_root /data/splits \ + --operation.type split \ + --operation.splits '{"train": 0.8, "val": 0.2}' + Split dataset by episode indices: lerobot-edit-dataset \ --repo_id lerobot/pusht \ @@ -54,12 +71,27 @@ Split into more than two splits: --operation.type split \ --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}' -Merge multiple datasets: +Merge multiple datasets (from Hub or default cache): lerobot-edit-dataset \ --repo_id lerobot/pusht_merged \ --operation.type merge \ --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" +Merge local datasets with explicit paths: + lerobot-edit-dataset \ + --repo_id lerobot/pusht_merged \ + --operation.type merge \ + --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" \ + --operation.roots "['/data/pusht_train', '/data/pusht_val']" + +Merge local datasets with custom output path: + lerobot-edit-dataset \ + --repo_id lerobot/pusht_merged \ + --new_root /output/merged \ + --operation.type merge \ + --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" \ + --operation.roots "['/data/pusht_train', '/data/pusht_val']" + Remove camera feature: lerobot-edit-dataset \ --repo_id lerobot/pusht \ @@ -88,8 +120,8 @@ Modify tasks - set default task with overrides for specific episodes (WARNING: m Convert image dataset to video format and save locally: lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ - --operation.type convert_image_to_video \ - --operation.output_dir /path/to/output/pusht_video + --new_root /path/to/output/pusht_video \ + --operation.type convert_image_to_video Convert image dataset to video format and save with new repo_id: lerobot-edit-dataset \ @@ -167,6 +199,8 @@ class SplitConfig(OperationConfig): @dataclass class MergeConfig(OperationConfig): repo_ids: list[str] | None = None + # Per-dataset exact folder paths. Must match repo_ids length when provided. + roots: list[str] | None = None @OperationConfig.register_subclass("remove_feature") @@ -208,26 +242,35 @@ class InfoConfig(OperationConfig): class EditDatasetConfig: repo_id: str operation: OperationConfig + # Exact dataset folder (containing meta/, data/, videos/). If None, defaults to HF_LEROBOT_HOME/{repo_id}. root: str | None = None new_repo_id: str | None = None + # Exact output dataset folder. If None, derived from new_repo_id or in-place. + new_root: str | None = None push_to_hub: bool = False -def get_output_path(repo_id: str, new_repo_id: str | None, root: Path | None) -> tuple[str, Path]: - if new_repo_id: +def get_output_path( + input_root: Path, + new_repo_id: str | None, + new_root: Path | None, + default_repo_id: str, +) -> tuple[str, Path]: + if new_root: + output_repo_id = new_repo_id or default_repo_id + output_dir = new_root + elif new_repo_id: output_repo_id = new_repo_id - output_dir = root / new_repo_id if root else HF_LEROBOT_HOME / new_repo_id + output_dir = HF_LEROBOT_HOME / new_repo_id else: - output_repo_id = repo_id - dataset_path = root / repo_id if root else HF_LEROBOT_HOME / repo_id - old_path = Path(str(dataset_path) + "_old") + output_repo_id = default_repo_id + output_dir = input_root + backup_path = input_root.with_name(input_root.name + "_old") - if dataset_path.exists(): - if old_path.exists(): - shutil.rmtree(old_path) - shutil.move(str(dataset_path), str(old_path)) - - output_dir = dataset_path + if input_root.exists(): + if backup_path.exists(): + shutil.rmtree(backup_path) + shutil.move(str(input_root), str(backup_path)) return output_repo_id, output_dir @@ -241,11 +284,14 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) output_repo_id, output_dir = get_output_path( - cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None + input_root=dataset.root, + new_repo_id=cfg.new_repo_id, + new_root=Path(cfg.new_root) if cfg.new_root else None, + default_repo_id=cfg.repo_id, ) - if cfg.new_repo_id is None: - dataset.root = Path(str(dataset.root) + "_old") + if output_dir == dataset.root: + dataset.root = dataset.root.with_name(dataset.root.name + "_old") logging.info(f"Deleting episodes {cfg.operation.episode_indices} from {cfg.repo_id}") new_dataset = delete_episodes( @@ -275,16 +321,20 @@ def handle_split(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) logging.info(f"Splitting dataset {cfg.repo_id} with splits: {cfg.operation.splits}") - split_datasets = split_dataset(dataset, splits=cfg.operation.splits) + split_output_dir = Path(cfg.new_root) if cfg.new_root else None + split_datasets = split_dataset( + dataset, + splits=cfg.operation.splits, + output_dir=split_output_dir, + ) for split_name, split_ds in split_datasets.items(): - split_repo_id = f"{cfg.repo_id}_{split_name}" logging.info( f"{split_name}: {split_ds.meta.total_episodes} episodes, {split_ds.meta.total_frames} frames" ) if cfg.push_to_hub: - logging.info(f"Pushing {split_name} split to hub as {split_repo_id}") + logging.info(f"Pushing {split_name} split to hub as {split_ds.repo_id}") LeRobotDataset(split_ds.repo_id, root=split_ds.root).push_to_hub() @@ -298,10 +348,19 @@ def handle_merge(cfg: EditDatasetConfig) -> None: if not cfg.repo_id: raise ValueError("repo_id must be specified as the output repository for merged dataset") - logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge") - datasets = [LeRobotDataset(repo_id, root=cfg.root) for repo_id in cfg.operation.repo_ids] + if cfg.operation.roots: + if len(cfg.operation.roots) != len(cfg.operation.repo_ids): + raise ValueError("operation.roots and operation.repo_ids must have the same length") + logging.info(f"Loading {len(cfg.operation.roots)} local datasets to merge") + datasets = [ + LeRobotDataset(repo_id, root=root) + for repo_id, root in zip(cfg.operation.repo_ids, cfg.operation.roots, strict=True) + ] + else: + logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge") + datasets = [LeRobotDataset(repo_id) for repo_id in cfg.operation.repo_ids] - output_dir = Path(cfg.root) / cfg.repo_id if cfg.root else HF_LEROBOT_HOME / cfg.repo_id + output_dir = Path(cfg.new_root) if cfg.new_root else HF_LEROBOT_HOME / cfg.repo_id logging.info(f"Merging datasets into {cfg.repo_id}") merged_dataset = merge_datasets( @@ -329,11 +388,14 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) output_repo_id, output_dir = get_output_path( - cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None + input_root=dataset.root, + new_repo_id=cfg.new_repo_id, + new_root=Path(cfg.new_root) if cfg.new_root else None, + default_repo_id=cfg.repo_id, ) - if cfg.new_repo_id is None: - dataset.root = Path(str(dataset.root) + "_old") + if output_dir == dataset.root: + dataset.root = dataset.root.with_name(dataset.root.name + "_old") logging.info(f"Removing features {cfg.operation.feature_names} from {cfg.repo_id}") new_dataset = remove_feature( @@ -361,9 +423,10 @@ def handle_modify_tasks(cfg: EditDatasetConfig) -> None: if new_task is None and episode_tasks_raw is None: raise ValueError("Must specify at least one of new_task or episode_tasks for modify_tasks operation") - # Warn about in-place modification behavior - if cfg.new_repo_id is not None: - logging.warning("modify_tasks modifies datasets in-place. The --new_repo_id parameter is ignored.") + if cfg.new_repo_id is not None or cfg.new_root is not None: + logging.warning( + "modify_tasks modifies datasets in-place. The --new_repo_id and --new_root parameters are ignored." + ) dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) logging.warning(f"Modifying dataset in-place at {dataset.root}. Original data will be overwritten.") @@ -399,31 +462,24 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) # Determine output directory and repo_id - # Priority: 1) new_repo_id, 2) operation.output_dir, 3) auto-generated name + # Priority: 1) new_root, 2) new_repo_id, 3) operation.output_dir, 4) auto-generated name output_dir_config = getattr(cfg.operation, "output_dir", None) - if cfg.new_repo_id: - # Use new_repo_id for both local storage and hub push + if cfg.new_root: + output_dir = Path(cfg.new_root) + output_repo_id = cfg.new_repo_id or f"{cfg.repo_id}_video" + logging.info(f"Saving to {output_dir} as {output_repo_id}") + elif cfg.new_repo_id: output_repo_id = cfg.new_repo_id - # Place new dataset as a sibling to the original dataset - # Get the parent of the actual dataset root (not cfg.root which might be the lerobot cache dir) - # Extract just the dataset name (after last slash) for the local directory - local_dir_name = cfg.new_repo_id.split("/")[-1] - output_dir = dataset.root.parent / local_dir_name + output_dir = HF_LEROBOT_HOME / cfg.new_repo_id logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}") elif output_dir_config: - # Use custom output directory for local-only storage output_dir = Path(output_dir_config) - # Extract repo name from output_dir for the dataset output_repo_id = output_dir.name logging.info(f"Saving to local directory: {output_dir}") else: - # Auto-generate name: append "_video" to original repo_id output_repo_id = f"{cfg.repo_id}_video" - # Place new dataset as a sibling to the original dataset - # Extract just the dataset name (after last slash) for the local directory - local_dir_name = output_repo_id.split("/")[-1] - output_dir = dataset.root.parent / local_dir_name + output_dir = HF_LEROBOT_HOME / output_repo_id logging.info(f"Saving to auto-generated location: {output_dir}") logging.info(f"Converting dataset {cfg.repo_id} to video format")