Compare commits

...

1 Commit

Author SHA1 Message Date
Khalil Meftah 8e24cfc68b fix(edit-dataset): align root with LeRobotDataset v2 semantics 2026-03-02 14:29:23 +01:00
+103 -47
View File
@@ -19,7 +19,9 @@ Edit LeRobot datasets using various transformation tools.
This script allows you to delete episodes, split datasets, merge datasets,
remove features, modify tasks, and convert image datasets to video format.
When new_repo_id is specified, creates a new dataset.
Path semantics (v2): --root and --new_root are exact dataset folders containing
meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}.
Usage Examples:
@@ -29,10 +31,18 @@ Delete episodes 0, 2, and 5 from a dataset:
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Delete episodes and save to a new dataset:
Delete episodes from a local dataset at a specific path:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--root /data/lerobot/pusht \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Delete episodes and save to a new dataset at a specific path:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--new_repo_id lerobot/pusht_filtered \
--new_root /data/lerobot/pusht_filtered \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
@@ -42,6 +52,13 @@ Split dataset by fractions:
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}'
Split dataset by fractions with custom output base directory:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--new_root /data/splits \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}'
Split dataset by episode indices:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
@@ -54,12 +71,27 @@ Split into more than two splits:
--operation.type split \
--operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'
Merge multiple datasets:
Merge multiple datasets (from Hub or default cache):
lerobot-edit-dataset \
--repo_id lerobot/pusht_merged \
--operation.type merge \
--operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']"
Merge local datasets with explicit paths:
lerobot-edit-dataset \
--repo_id lerobot/pusht_merged \
--operation.type merge \
--operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" \
--operation.roots "['/data/pusht_train', '/data/pusht_val']"
Merge local datasets with custom output path:
lerobot-edit-dataset \
--repo_id lerobot/pusht_merged \
--new_root /output/merged \
--operation.type merge \
--operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']" \
--operation.roots "['/data/pusht_train', '/data/pusht_val']"
Remove camera feature:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
@@ -88,8 +120,8 @@ Modify tasks - set default task with overrides for specific episodes (WARNING: m
Convert image dataset to video format and save locally:
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type convert_image_to_video \
--operation.output_dir /path/to/output/pusht_video
--new_root /path/to/output/pusht_video \
--operation.type convert_image_to_video
Convert image dataset to video format and save with new repo_id:
lerobot-edit-dataset \
@@ -167,6 +199,8 @@ class SplitConfig(OperationConfig):
@dataclass
class MergeConfig(OperationConfig):
repo_ids: list[str] | None = None
# Per-dataset exact folder paths. Must match repo_ids length when provided.
roots: list[str] | None = None
@OperationConfig.register_subclass("remove_feature")
@@ -208,26 +242,35 @@ class InfoConfig(OperationConfig):
class EditDatasetConfig:
repo_id: str
operation: OperationConfig
# Exact dataset folder (containing meta/, data/, videos/). If None, defaults to HF_LEROBOT_HOME/{repo_id}.
root: str | None = None
new_repo_id: str | None = None
# Exact output dataset folder. If None, derived from new_repo_id or in-place.
new_root: str | None = None
push_to_hub: bool = False
def get_output_path(repo_id: str, new_repo_id: str | None, root: Path | None) -> tuple[str, Path]:
if new_repo_id:
def get_output_path(
input_root: Path,
new_repo_id: str | None,
new_root: Path | None,
default_repo_id: str,
) -> tuple[str, Path]:
if new_root:
output_repo_id = new_repo_id or default_repo_id
output_dir = new_root
elif new_repo_id:
output_repo_id = new_repo_id
output_dir = root / new_repo_id if root else HF_LEROBOT_HOME / new_repo_id
output_dir = HF_LEROBOT_HOME / new_repo_id
else:
output_repo_id = repo_id
dataset_path = root / repo_id if root else HF_LEROBOT_HOME / repo_id
old_path = Path(str(dataset_path) + "_old")
output_repo_id = default_repo_id
output_dir = input_root
backup_path = input_root.with_name(input_root.name + "_old")
if dataset_path.exists():
if old_path.exists():
shutil.rmtree(old_path)
shutil.move(str(dataset_path), str(old_path))
output_dir = dataset_path
if input_root.exists():
if backup_path.exists():
shutil.rmtree(backup_path)
shutil.move(str(input_root), str(backup_path))
return output_repo_id, output_dir
@@ -241,11 +284,14 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
output_repo_id, output_dir = get_output_path(
cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None
input_root=dataset.root,
new_repo_id=cfg.new_repo_id,
new_root=Path(cfg.new_root) if cfg.new_root else None,
default_repo_id=cfg.repo_id,
)
if cfg.new_repo_id is None:
dataset.root = Path(str(dataset.root) + "_old")
if output_dir == dataset.root:
dataset.root = dataset.root.with_name(dataset.root.name + "_old")
logging.info(f"Deleting episodes {cfg.operation.episode_indices} from {cfg.repo_id}")
new_dataset = delete_episodes(
@@ -275,16 +321,20 @@ def handle_split(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
logging.info(f"Splitting dataset {cfg.repo_id} with splits: {cfg.operation.splits}")
split_datasets = split_dataset(dataset, splits=cfg.operation.splits)
split_output_dir = Path(cfg.new_root) if cfg.new_root else None
split_datasets = split_dataset(
dataset,
splits=cfg.operation.splits,
output_dir=split_output_dir,
)
for split_name, split_ds in split_datasets.items():
split_repo_id = f"{cfg.repo_id}_{split_name}"
logging.info(
f"{split_name}: {split_ds.meta.total_episodes} episodes, {split_ds.meta.total_frames} frames"
)
if cfg.push_to_hub:
logging.info(f"Pushing {split_name} split to hub as {split_repo_id}")
logging.info(f"Pushing {split_name} split to hub as {split_ds.repo_id}")
LeRobotDataset(split_ds.repo_id, root=split_ds.root).push_to_hub()
@@ -298,10 +348,19 @@ def handle_merge(cfg: EditDatasetConfig) -> None:
if not cfg.repo_id:
raise ValueError("repo_id must be specified as the output repository for merged dataset")
logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge")
datasets = [LeRobotDataset(repo_id, root=cfg.root) for repo_id in cfg.operation.repo_ids]
if cfg.operation.roots:
if len(cfg.operation.roots) != len(cfg.operation.repo_ids):
raise ValueError("operation.roots and operation.repo_ids must have the same length")
logging.info(f"Loading {len(cfg.operation.roots)} local datasets to merge")
datasets = [
LeRobotDataset(repo_id, root=root)
for repo_id, root in zip(cfg.operation.repo_ids, cfg.operation.roots, strict=True)
]
else:
logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge")
datasets = [LeRobotDataset(repo_id) for repo_id in cfg.operation.repo_ids]
output_dir = Path(cfg.root) / cfg.repo_id if cfg.root else HF_LEROBOT_HOME / cfg.repo_id
output_dir = Path(cfg.new_root) if cfg.new_root else HF_LEROBOT_HOME / cfg.repo_id
logging.info(f"Merging datasets into {cfg.repo_id}")
merged_dataset = merge_datasets(
@@ -329,11 +388,14 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
output_repo_id, output_dir = get_output_path(
cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None
input_root=dataset.root,
new_repo_id=cfg.new_repo_id,
new_root=Path(cfg.new_root) if cfg.new_root else None,
default_repo_id=cfg.repo_id,
)
if cfg.new_repo_id is None:
dataset.root = Path(str(dataset.root) + "_old")
if output_dir == dataset.root:
dataset.root = dataset.root.with_name(dataset.root.name + "_old")
logging.info(f"Removing features {cfg.operation.feature_names} from {cfg.repo_id}")
new_dataset = remove_feature(
@@ -361,9 +423,10 @@ def handle_modify_tasks(cfg: EditDatasetConfig) -> None:
if new_task is None and episode_tasks_raw is None:
raise ValueError("Must specify at least one of new_task or episode_tasks for modify_tasks operation")
# Warn about in-place modification behavior
if cfg.new_repo_id is not None:
logging.warning("modify_tasks modifies datasets in-place. The --new_repo_id parameter is ignored.")
if cfg.new_repo_id is not None or cfg.new_root is not None:
logging.warning(
"modify_tasks modifies datasets in-place. The --new_repo_id and --new_root parameters are ignored."
)
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
logging.warning(f"Modifying dataset in-place at {dataset.root}. Original data will be overwritten.")
@@ -399,31 +462,24 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
# Determine output directory and repo_id
# Priority: 1) new_repo_id, 2) operation.output_dir, 3) auto-generated name
# Priority: 1) new_root, 2) new_repo_id, 3) operation.output_dir, 4) auto-generated name
output_dir_config = getattr(cfg.operation, "output_dir", None)
if cfg.new_repo_id:
# Use new_repo_id for both local storage and hub push
if cfg.new_root:
output_dir = Path(cfg.new_root)
output_repo_id = cfg.new_repo_id or f"{cfg.repo_id}_video"
logging.info(f"Saving to {output_dir} as {output_repo_id}")
elif cfg.new_repo_id:
output_repo_id = cfg.new_repo_id
# Place new dataset as a sibling to the original dataset
# Get the parent of the actual dataset root (not cfg.root which might be the lerobot cache dir)
# Extract just the dataset name (after last slash) for the local directory
local_dir_name = cfg.new_repo_id.split("/")[-1]
output_dir = dataset.root.parent / local_dir_name
output_dir = HF_LEROBOT_HOME / cfg.new_repo_id
logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}")
elif output_dir_config:
# Use custom output directory for local-only storage
output_dir = Path(output_dir_config)
# Extract repo name from output_dir for the dataset
output_repo_id = output_dir.name
logging.info(f"Saving to local directory: {output_dir}")
else:
# Auto-generate name: append "_video" to original repo_id
output_repo_id = f"{cfg.repo_id}_video"
# Place new dataset as a sibling to the original dataset
# Extract just the dataset name (after last slash) for the local directory
local_dir_name = output_repo_id.split("/")[-1]
output_dir = dataset.root.parent / local_dir_name
output_dir = HF_LEROBOT_HOME / output_repo_id
logging.info(f"Saving to auto-generated location: {output_dir}")
logging.info(f"Converting dataset {cfg.repo_id} to video format")