fix(dataset edit tools): clarifying root argument usage + adding related features (#3049)

* fix(root): adding proper support for the root and new_root arguments

* feat(roots): adding a roots agrument for the merge operation

* chore(clean): cleaning up code

* chore(doctrings): updating doctrings with new features

* fix(repo_id): setting repo_id to None when not needed

* fix(roots/repo_ids): making mypy happy by using repo_ids and roots for merge operation

* fix(path): fixing path related issues

* fix(repo_id): fixing issues related to repo_id

* chore(doctrings): updating docstrings + fix typo

* chore(clean): cleaning code

* fix(split new_repo_id): reverting new_repo_id addition for split operation

* docs(dosctrings): completing docstrings

* fix(repo_ids/roots): improving checks for repo_ids/roots lengths

* fix(repo_ids): making repo_ids optional in MergeConfig but raise if not given

* fix(docstrings): fixing docstrings for split operation

* fix(hints): updating get_output_path hints to accept paths as strings too

* fix(y/N prompts): removing y/N prompts in lerobot_edit_dataset

* fix(merge repo_id): fixing merge operation to use new_repo_id instead of repo_id

* fix(typo): fixing typo in doctrings
This commit is contained in:
Caroline Pascal
2026-03-03 15:40:46 +01:00
committed by GitHub
parent 8a0cc3d664
commit 63dca86df8
3 changed files with 173 additions and 76 deletions
+15 -14
View File
@@ -89,8 +89,8 @@ def delete_episodes(
Args:
dataset: The source LeRobotDataset.
episode_indices: List of episode indices to delete.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
"""
if not episode_indices:
raise ValueError("No episodes to delete")
@@ -152,7 +152,7 @@ def split_dataset(
dataset: The source LeRobotDataset to split.
splits: Either a dict mapping split names to episode indices, or a dict mapping
split names to fractions (must sum to <= 1.0).
output_dir: Base directory for output datasets. If None, uses default location.
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
Examples:
Split by specific episodes
@@ -243,8 +243,8 @@ def merge_datasets(
Args:
datasets: List of LeRobotDatasets to merge.
output_repo_id: Repository ID for the merged dataset.
output_dir: Directory to save the merged dataset. If None, uses default location.
output_repo_id: Merged dataset identifier.
output_dir: Root directory where the merged dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/output_repo_id.
"""
if not datasets:
raise ValueError("No datasets to merge")
@@ -288,8 +288,8 @@ def modify_features(
dataset: The source LeRobotDataset.
add_features: Optional dict mapping feature names to (feature_values, feature_info) tuples.
remove_features: Optional feature name(s) to remove. Can be a single string or list.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
Returns:
New dataset with features modified.
@@ -390,8 +390,8 @@ def add_features(
Args:
dataset: The source LeRobotDataset.
features: Dictionary mapping feature names to (feature_values, feature_info) tuples.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
Returns:
New dataset with all features added.
@@ -427,8 +427,8 @@ def remove_feature(
Args:
dataset: The source LeRobotDataset.
feature_names: Name(s) of features to remove. Can be a single string or list.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
Returns:
New dataset with features removed.
@@ -1529,7 +1529,7 @@ def modify_tasks(
def convert_image_to_video_dataset(
dataset: LeRobotDataset,
output_dir: Path,
output_dir: Path | None = None,
repo_id: str | None = None,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
@@ -1548,8 +1548,8 @@ def convert_image_to_video_dataset(
Args:
dataset: The source LeRobot dataset with images
output_dir: Directory to save the new video dataset
repo_id: Repository ID for the new dataset (default: original_id + "_video")
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
vcodec: Video codec (default: libsvtav1)
pix_fmt: Pixel format (default: yuv420p)
g: Group of pictures size (default: 2)
@@ -1600,6 +1600,7 @@ def convert_image_to_video_dataset(
# Video info will be updated after episodes are encoded
# Create new metadata for video dataset
output_dir = Path(output_dir) if output_dir is not None else HF_LEROBOT_HOME / repo_id
new_meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
fps=dataset.meta.fps,
+141 -60
View File
@@ -21,6 +21,9 @@ This script allows you to delete episodes, split datasets, merge datasets,
remove features, modify tasks, and convert image datasets to video format.
When new_repo_id is specified, creates a new dataset.
Path semantics (v2): --root and --new_root are exact dataset folders containing
meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}.
Usage Examples:
Delete episodes 0, 2, and 5 from a dataset:
@@ -29,16 +32,31 @@ Delete episodes 0, 2, and 5 from a dataset:
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Delete episodes and save to a new dataset:
Delete episodes from a local dataset at a specific path:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--new_repo_id lerobot/pusht_filtered \
--root /path/to/pusht \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Split dataset by fractions:
Delete episodes and save to a new dataset at a specific path and with a new repo_id:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--new_repo_id lerobot/pusht_filtered \
--new_root /path/to/pusht_filtered \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Split dataset by fractions (pusht_train, pusht_val):
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}'
Split dataset by fractions and save split datasets to a specific folder (base_folder/train, base_folder/val):
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--new_root /path/to/base_folder \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}'
@@ -56,15 +74,29 @@ Split into more than two splits:
Merge multiple datasets:
lerobot-edit-dataset \
--repo_id lerobot/pusht_merged \
--new_repo_id lerobot/pusht_merged \
--operation.type merge \
--operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']"
Merge multiple datasets to a specific output path:
lerobot-edit-dataset \
--new_repo_id lerobot/pusht_merged \
--new_root /path/to/pusht_merged \
--operation.type merge \
--operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']"
Merge multiple datasets from a list of local dataset paths:
lerobot-edit-dataset \
--new_repo_id lerobot/pusht_merged \
--operation.type merge \
--operation.repo_ids "['pusht_train', 'pusht_val']" \
--operation.roots "['/path/to/pusht_train', '/path/to/pusht_val']"
Remove camera feature:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type remove_feature \
--operation.feature_names "['observation.images.top']"
--operation.feature_names "['observation.image']"
Modify tasks - set a single task for all episodes (WARNING: modifies in-place):
lerobot-edit-dataset \
@@ -88,8 +120,8 @@ Modify tasks - set default task with overrides for specific episodes (WARNING: m
Convert image dataset to video format and save locally:
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type convert_image_to_video \
--operation.output_dir /path/to/output/pusht_video
--new_root /path/to/output/pusht_video \
--operation.type convert_image_to_video
Convert image dataset to video format and save with new repo_id:
lerobot-edit-dataset \
@@ -167,6 +199,7 @@ class SplitConfig(OperationConfig):
@dataclass
class MergeConfig(OperationConfig):
repo_ids: list[str] | None = None
roots: list[str] | None = None
@OperationConfig.register_subclass("remove_feature")
@@ -200,36 +233,46 @@ class ConvertImageToVideoConfig(OperationConfig):
@OperationConfig.register_subclass("info")
@dataclass
class InfoConfig(OperationConfig):
type: str = "info"
show_features: bool = False
@dataclass
class EditDatasetConfig:
repo_id: str
# Operation configuration.
operation: OperationConfig
# Input dataset identifier. Always required unless for Merge operation.
repo_id: str | None = None
# Root directory where the input dataset is stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
root: str | None = None
# Edited dataset identifier. When both new_repo_id (resp. new_root) and repo_id (resp. root) are identical, modifications are applied in-place and a backup of the original dataset is created. Required for Merge operation.
new_repo_id: str | None = None
# Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/new_repo_id. For Split operation, this is the base directory for the split datasets.
new_root: str | None = None
# Upload dataset to Hugging Face hub.
push_to_hub: bool = False
def get_output_path(repo_id: str, new_repo_id: str | None, root: Path | None) -> tuple[str, Path]:
if new_repo_id:
output_repo_id = new_repo_id
output_dir = root / new_repo_id if root else HF_LEROBOT_HOME / new_repo_id
else:
output_repo_id = repo_id
dataset_path = root / repo_id if root else HF_LEROBOT_HOME / repo_id
old_path = Path(str(dataset_path) + "_old")
def get_output_path(
repo_id: str,
new_repo_id: str | None,
root: Path | str | None,
new_root: Path | str | None,
) -> tuple[str, Path]:
input_path = Path(root) if root else HF_LEROBOT_HOME / repo_id
if dataset_path.exists():
if old_path.exists():
shutil.rmtree(old_path)
shutil.move(str(dataset_path), str(old_path))
output_repo_id = new_repo_id if new_repo_id else repo_id
output_path = Path(new_root) if new_root else HF_LEROBOT_HOME / output_repo_id
output_dir = dataset_path
# In case of in-place modification, create a backup of the original dataset (if it exists)
if output_path == input_path:
backup_path = input_path.with_name(input_path.name + "_old")
return output_repo_id, output_dir
if input_path.exists():
if backup_path.exists():
shutil.rmtree(backup_path)
shutil.move(input_path, backup_path)
return output_repo_id, output_path
def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
@@ -241,11 +284,15 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
output_repo_id, output_dir = get_output_path(
cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None
cfg.repo_id,
new_repo_id=cfg.new_repo_id,
root=cfg.root,
new_root=cfg.new_root,
)
if cfg.new_repo_id is None:
dataset.root = Path(str(dataset.root) + "_old")
# In case of in-place modification, make the dataset point to the backup directory
if output_dir == dataset.root:
dataset.root = dataset.root.with_name(dataset.root.name + "_old")
logging.info(f"Deleting episodes {cfg.operation.episode_indices} from {cfg.repo_id}")
new_dataset = delete_episodes(
@@ -272,19 +319,27 @@ def handle_split(cfg: EditDatasetConfig) -> None:
"splits dict must be specified with split names as keys and fractions/episode lists as values"
)
if cfg.new_repo_id is not None:
logging.warning(
"split uses the original dataset identifier --repo_id to generate split names. The --new_repo_id parameter is ignored."
)
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
logging.info(f"Splitting dataset {cfg.repo_id} with splits: {cfg.operation.splits}")
split_datasets = split_dataset(dataset, splits=cfg.operation.splits)
split_datasets = split_dataset(
dataset,
splits=cfg.operation.splits,
output_dir=cfg.new_root,
)
for split_name, split_ds in split_datasets.items():
split_repo_id = f"{cfg.repo_id}_{split_name}"
logging.info(
f"{split_name}: {split_ds.meta.total_episodes} episodes, {split_ds.meta.total_frames} frames"
)
if cfg.push_to_hub:
logging.info(f"Pushing {split_name} split to hub as {split_repo_id}")
logging.info(f"Pushing {split_name} split to hub as {split_ds.repo_id}")
LeRobotDataset(split_ds.repo_id, root=split_ds.root).push_to_hub()
@@ -295,18 +350,29 @@ def handle_merge(cfg: EditDatasetConfig) -> None:
if not cfg.operation.repo_ids:
raise ValueError("repo_ids must be specified for merge operation")
if not cfg.repo_id:
raise ValueError("repo_id must be specified as the output repository for merged dataset")
if cfg.repo_id is not None or cfg.root is not None:
logging.warning(
"merge uses --new_repo_id and --new_root for the merged dataset. The --repo_id and --root parameters are ignored."
)
logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge")
datasets = [LeRobotDataset(repo_id, root=cfg.root) for repo_id in cfg.operation.repo_ids]
if cfg.operation.roots:
if len(cfg.operation.roots) != len(cfg.operation.repo_ids):
raise ValueError("repo_ids and roots must have the same length for merge operation")
logging.info(f"Loading {len(cfg.operation.roots)} datasets to merge")
datasets = [
LeRobotDataset(repo_id=repo_id, root=root)
for repo_id, root in zip(cfg.operation.repo_ids, cfg.operation.roots, strict=True)
]
else:
logging.info(f"Loading {len(cfg.operation.repo_ids)} datasets to merge")
datasets = [LeRobotDataset(repo_id) for repo_id in cfg.operation.repo_ids]
output_dir = Path(cfg.root) / cfg.repo_id if cfg.root else HF_LEROBOT_HOME / cfg.repo_id
output_dir = Path(cfg.new_root) if cfg.new_root else HF_LEROBOT_HOME / cfg.new_repo_id
logging.info(f"Merging datasets into {cfg.repo_id}")
logging.info(f"Merging datasets into {cfg.new_repo_id}")
merged_dataset = merge_datasets(
datasets,
output_repo_id=cfg.repo_id,
output_repo_id=cfg.new_repo_id,
output_dir=output_dir,
)
@@ -316,7 +382,7 @@ def handle_merge(cfg: EditDatasetConfig) -> None:
)
if cfg.push_to_hub:
logging.info(f"Pushing to hub as {cfg.repo_id}")
logging.info(f"Pushing to hub as {cfg.new_repo_id}")
LeRobotDataset(merged_dataset.repo_id, root=output_dir).push_to_hub()
@@ -329,11 +395,15 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
output_repo_id, output_dir = get_output_path(
cfg.repo_id, cfg.new_repo_id, Path(cfg.root) if cfg.root else None
cfg.repo_id,
new_repo_id=cfg.new_repo_id,
root=cfg.root,
new_root=cfg.new_root,
)
if cfg.new_repo_id is None:
dataset.root = Path(str(dataset.root) + "_old")
# In case of in-place modification, make the dataset point to the backup directory
if output_dir == dataset.root:
dataset.root = dataset.root.with_name(dataset.root.name + "_old")
logging.info(f"Removing features {cfg.operation.feature_names} from {cfg.repo_id}")
new_dataset = remove_feature(
@@ -361,9 +431,10 @@ def handle_modify_tasks(cfg: EditDatasetConfig) -> None:
if new_task is None and episode_tasks_raw is None:
raise ValueError("Must specify at least one of new_task or episode_tasks for modify_tasks operation")
# Warn about in-place modification behavior
if cfg.new_repo_id is not None:
logging.warning("modify_tasks modifies datasets in-place. The --new_repo_id parameter is ignored.")
if cfg.new_repo_id is not None or cfg.new_root is not None:
logging.warning(
"modify_tasks modifies datasets in-place. The --new_repo_id and --new_root parameters are ignored."
)
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
logging.warning(f"Modifying dataset in-place at {dataset.root}. Original data will be overwritten.")
@@ -399,32 +470,30 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
# Determine output directory and repo_id
# Priority: 1) new_repo_id, 2) operation.output_dir, 3) auto-generated name
# Priority: 1) new_root, 2) new_repo_id, 3) operation.output_dir, 4) auto-generated name
output_dir_config = getattr(cfg.operation, "output_dir", None)
if output_dir_config:
logging.warning(
"--operation.output_dir is deprecated and will be removed in future versions. "
"Please use --new_root instead."
)
if cfg.new_repo_id:
# Use new_repo_id for both local storage and hub push
if cfg.new_root:
output_dir = Path(cfg.new_root)
output_repo_id = cfg.new_repo_id or f"{cfg.repo_id}_video"
logging.info(f"Saving to new_root: {output_dir} as {output_repo_id}")
elif cfg.new_repo_id:
output_repo_id = cfg.new_repo_id
# Place new dataset as a sibling to the original dataset
# Get the parent of the actual dataset root (not cfg.root which might be the lerobot cache dir)
# Extract just the dataset name (after last slash) for the local directory
local_dir_name = cfg.new_repo_id.split("/")[-1]
output_dir = dataset.root.parent / local_dir_name
output_dir = HF_LEROBOT_HOME / cfg.new_repo_id
logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}")
elif output_dir_config:
# Use custom output directory for local-only storage
output_dir = Path(output_dir_config)
# Extract repo name from output_dir for the dataset
output_repo_id = output_dir.name
logging.info(f"Saving to local directory: {output_dir}")
logging.info(f"Saving to local directory: {output_dir} as {output_repo_id}")
else:
# Auto-generate name: append "_video" to original repo_id
output_repo_id = f"{cfg.repo_id}_video"
# Place new dataset as a sibling to the original dataset
# Extract just the dataset name (after last slash) for the local directory
local_dir_name = output_repo_id.split("/")[-1]
output_dir = dataset.root.parent / local_dir_name
logging.info(f"Saving to auto-generated location: {output_dir}")
output_dir = HF_LEROBOT_HOME / output_repo_id
logging.info(f"Saving to auto-generated location: {output_dir} as {output_repo_id}")
logging.info(f"Converting dataset {cfg.repo_id} to video format")
@@ -499,8 +568,20 @@ def handle_info(cfg: EditDatasetConfig):
sys.stdout.write(f"{feature_dump_str}\n")
def _validate_config(cfg: EditDatasetConfig) -> None:
if isinstance(cfg.operation, MergeConfig):
if not cfg.new_repo_id:
raise ValueError("--new_repo_id is required for merge operation (the merged dataset identifier)")
else:
if not cfg.repo_id:
raise ValueError(
f"--repo_id is required for {cfg.operation.type} operation (the input dataset identifier)"
)
@parser.wrap()
def edit_dataset(cfg: EditDatasetConfig) -> None:
_validate_config(cfg)
operation_type = cfg.operation.type
if operation_type == "delete_episodes":
+17 -2
View File
@@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import (
OperationConfig,
RemoveFeatureConfig,
SplitConfig,
_validate_config,
)
@@ -51,11 +52,23 @@ class TestOperationTypeParsing:
],
)
def test_operation_type_resolves_correct_class(self, type_name, expected_cls):
cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", type_name])
cfg = parse_cfg(
["--repo_id", "test/repo", "--new_repo_id", "test/merged", "--operation.type", type_name]
)
assert isinstance(cfg.operation, expected_cls), (
f"Expected {expected_cls.__name__}, got {type(cfg.operation).__name__}"
)
def test_merge_requires_new_repo_id(self):
cfg = parse_cfg(["--operation.type", "merge"])
with pytest.raises(ValueError, match="--new_repo_id is required for merge"):
_validate_config(cfg)
def test_non_merge_requires_repo_id(self):
cfg = parse_cfg(["--operation.type", "delete_episodes"])
with pytest.raises(ValueError, match="--repo_id is required for delete_episodes"):
_validate_config(cfg)
@pytest.mark.parametrize(
"type_name, expected_cls",
[
@@ -69,6 +82,8 @@ class TestOperationTypeParsing:
],
)
def test_get_choice_name_roundtrips(self, type_name, expected_cls):
cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", type_name])
cfg = parse_cfg(
["--repo_id", "test/repo", "--new_repo_id", "test/merged", "--operation.type", type_name]
)
resolved_name = OperationConfig.get_choice_name(type(cfg.operation))
assert resolved_name == type_name