From d22fa6446b6cb041098dcff73664cfd179d3a437 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Mon, 1 Dec 2025 13:46:22 +0100 Subject: [PATCH] add video encoding tool --- docs/source/using_dataset_tools.mdx | 50 +++- src/lerobot/scripts/lerobot_edit_dataset.py | 262 +++++++++++++++++++- 2 files changed, 307 insertions(+), 5 deletions(-) diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index affca0ee5..9bf345a61 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -11,13 +11,14 @@ LeRobot provides several utilities for manipulating datasets: 3. **Merge Datasets** - Combine multiple datasets into one. The datasets must have identical features, and episodes are concatenated in the order specified in `repo_ids` 4. **Add Features** - Add new features to a dataset 5. **Remove Features** - Remove features from a dataset +6. **Convert to Video** - Convert image-based datasets to video format for efficient storage The core implementation is in `lerobot.datasets.dataset_tools`. An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`. ## Command-Line Tool: lerobot-edit-dataset -`lerobot-edit-dataset` is a command-line script for editing datasets. It can be used to delete episodes, split datasets, merge datasets, add features, and remove features. +`lerobot-edit-dataset` is a command-line script for editing datasets. It can be used to delete episodes, split datasets, merge datasets, add features, remove features, and convert image datasets to video format. Run `lerobot-edit-dataset --help` for more information on the configuration of each operation. @@ -86,6 +87,53 @@ lerobot-edit-dataset \ --operation.feature_names "['observation.images.top']" ``` +#### Convert to Video + +Convert an image-based dataset to video format. This is useful for reducing storage requirements and improving data loading performance. 
Videos are encoded with configurable quality settings. + +```bash +# Convert all episodes to video format with default settings +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_to_video \ + --operation.output_dir outputs/converted_videos + +# Convert with custom video codec and quality settings +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_to_video \ + --operation.output_dir outputs/converted_videos \ + --operation.vcodec libsvtav1 \ + --operation.pix_fmt yuv420p \ + --operation.g 2 \ + --operation.crf 30 + +# Convert only specific episodes +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_to_video \ + --operation.output_dir outputs/converted_videos \ + --operation.episode_indices "[0, 1, 2, 5, 10]" + +# Convert with multiple workers for parallel processing +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_to_video \ + --operation.output_dir outputs/converted_videos \ + --operation.num_workers 8 +``` + +**Parameters:** +- `output_dir`: Directory where videos will be saved (default: `outputs/converted_videos`) +- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`) +- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`) +- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2) +- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30) +- `fast_decode`: Fast decode tuning option (default: 0) +- `episode_indices`: List of specific episodes to convert (default: all episodes) +- `num_workers`: Number of parallel workers for processing (default: 4) +- `overwrite`: Overwrite existing video files if they exist + ### Push to Hub Add the `--push_to_hub` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub: diff --git 
a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index 83ba027bc..23a68d908 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -18,7 +18,8 @@ Edit LeRobot datasets using various transformation tools. This script allows you to delete episodes, split datasets, merge datasets, -and remove features. When new_repo_id is specified, creates a new dataset. +remove features, and convert image datasets to video format. +When new_repo_id is specified, creates a new dataset. Usage Examples: @@ -65,6 +66,14 @@ Remove camera feature: --operation.type remove_feature \ --operation.feature_names "['observation.images.top']" +Convert image dataset to video format: + python -m lerobot.scripts.lerobot_edit_dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_to_video \ + --operation.output_dir outputs/converted_videos \ + --operation.vcodec libsvtav1 \ + --operation.crf 30 + Using JSON config file: python -m lerobot.scripts.lerobot_edit_dataset \ --config_path path/to/edit_config.json @@ -72,9 +81,12 @@ Using JSON config file: import logging import shutil +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass from pathlib import Path +from tqdm import tqdm + from lerobot.configs import parser from lerobot.datasets.dataset_tools import ( delete_episodes, @@ -83,7 +95,8 @@ from lerobot.datasets.dataset_tools import ( split_dataset, ) from lerobot.datasets.lerobot_dataset import LeRobotDataset -from lerobot.utils.constants import HF_LEROBOT_HOME +from lerobot.datasets.video_utils import encode_video_frames +from lerobot.utils.constants import HF_LEROBOT_HOME, OBS_IMAGE from lerobot.utils.utils import init_logging @@ -111,10 +124,24 @@ class RemoveFeatureConfig: feature_names: list[str] | None = None +@dataclass +class ConvertToVideoConfig: + type: str = "convert_to_video" + output_dir: str = "outputs/converted_videos" + 
vcodec: str = "libsvtav1" + pix_fmt: str = "yuv420p" + g: int = 2 + crf: int = 30 + fast_decode: int = 0 + episode_indices: list[int] | None = None + num_workers: int = 4 + overwrite: bool = False + + @dataclass class EditDatasetConfig: repo_id: str - operation: DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig + operation: DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig | ConvertToVideoConfig root: str | None = None new_repo_id: str | None = None push_to_hub: bool = False @@ -258,6 +285,231 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None: LeRobotDataset(output_repo_id, root=output_dir).push_to_hub() +def save_episode_images( + dataset: LeRobotDataset, + imgs_dir: Path, + episode_index: int = 0, + overwrite: bool = False, + num_workers: int = 4, +) -> None: + """Save images from a specific episode to disk. + + Args: + dataset: The LeRobot dataset to extract images from + imgs_dir: Directory to save images to + episode_index: Index of the episode to save (default: 0) + overwrite: Whether to overwrite existing images + num_workers: Number of threads for parallel image saving (default: 4) + """ + ep_num_images = dataset.meta.episodes["length"][episode_index] + + # Check if images already exist + if not overwrite and imgs_dir.exists() and len(list(imgs_dir.glob("frame-*.png"))) == ep_num_images: + logging.info(f"Images for episode {episode_index} already exist in {imgs_dir}. 
Skipping.") + return + + # Create directory + imgs_dir.mkdir(parents=True, exist_ok=True) + + # Get dataset without torch format for PIL image access + hf_dataset = dataset.hf_dataset.with_format(None) + + # Get all image keys (for all cameras) + img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)] + + if len(img_keys) == 0: + raise ValueError(f"No image keys found in dataset {dataset.repo_id}") + + # Use first camera only + img_key = img_keys[0] + imgs_dataset = hf_dataset.select_columns(img_key) + + # Get episode start and end indices + from_idx = dataset.meta.episodes["dataset_from_index"][episode_index] + to_idx = dataset.meta.episodes["dataset_to_index"][episode_index] + + # Get all items for this episode + episode_dataset = imgs_dataset.select(range(from_idx, to_idx)) + + # Define function to save a single image + def save_single_image(i_item_tuple): + i, item = i_item_tuple + img = item[img_key] + # Use frame-XXXXXX.png format to match encode_video_frames expectations + img.save(str(imgs_dir / f"frame-{i:06d}.png"), quality=100) + return i + + # Save images with proper naming convention for encode_video_frames (frame-XXXXXX.png) + # Use ThreadPoolExecutor for parallel processing + items = list(enumerate(episode_dataset)) + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(save_single_image, item) for item in items] + for future in tqdm( + as_completed(futures), + total=len(items), + desc=f"Saving {dataset.repo_id} episode {episode_index} images", + leave=False, + ): + future.result() # This will raise any exceptions that occurred + + +def process_single_episode( + dataset: LeRobotDataset, + episode_index: int, + output_dir: Path, + vcodec: str, + pix_fmt: str, + g: int | None, + crf: int | None, + fast_decode: int, + fps: int, + num_image_workers: int, + overwrite: bool, +) -> str: + """Process a single episode: save images and encode to video. 
+ + Args: + dataset: The LeRobot dataset + episode_index: Index of the episode to process + output_dir: Base directory for outputs + vcodec: Video codec + pix_fmt: Pixel format + g: Group of pictures size + crf: Constant rate factor + fast_decode: Fast decode tuning + fps: Frames per second + num_image_workers: Number of threads for parallel image saving + overwrite: Whether to overwrite existing files + + Returns: + Status message for this episode + """ + # Create paths + imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_") / f"episode_{episode_index:06d}" + + # Create video filename with encoding parameters + video_filename = f"{dataset.repo_id.replace('/', '_')}_ep{episode_index:06d}_{vcodec}_{pix_fmt}_g{g}_crf{crf}.mp4" + video_path = output_dir / "videos" / dataset.repo_id.replace("/", "_") / video_filename + + # Save episode images + save_episode_images(dataset, imgs_dir, episode_index, overwrite, num_image_workers) + + # Encode to video + if overwrite or not video_path.is_file(): + video_path.parent.mkdir(parents=True, exist_ok=True) + + encode_video_frames( + imgs_dir=imgs_dir, + video_path=video_path, + fps=fps, + vcodec=vcodec, + pix_fmt=pix_fmt, + g=g, + crf=crf, + fast_decode=fast_decode, + overwrite=True, + ) + + return f"✓ Video saved to {video_path}" + else: + return f"Video already exists: {video_path}. Skipping." + + +def convert_dataset_to_videos( + dataset: LeRobotDataset, + output_dir: Path, + vcodec: str = "libsvtav1", + pix_fmt: str = "yuv420p", + g: int = 2, + crf: int = 30, + fast_decode: int = 0, + episode_indices: list[int] | None = None, + num_workers: int = 4, + overwrite: bool = False, +) -> None: + """Convert dataset images to video files.
+ + Args: + dataset: The LeRobot dataset + output_dir: Base directory for outputs + vcodec: Video codec (default: libsvtav1) + pix_fmt: Pixel format (default: yuv420p) + g: Group of pictures size (default: 2) + crf: Constant rate factor (default: 30) + fast_decode: Fast decode tuning (default: 0) + episode_indices: List of episode indices to convert (None = all episodes) + num_workers: Number of threads for parallel episode processing (default: 4) + overwrite: Whether to overwrite existing files + """ + # Check that it's an image dataset + if len(dataset.meta.video_keys) > 0: + raise ValueError( + f"This operation is for image datasets only. Video dataset provided: {dataset.repo_id}" + ) + + fps = dataset.fps + + # Determine which episodes to process + num_episodes = len(dataset.meta.episodes) + if episode_indices is None: + episode_indices = list(range(num_episodes)) + + logging.info(f"Processing {len(episode_indices)} episodes from {dataset.repo_id} with {num_workers} workers") + + # Process episodes in parallel + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [ + executor.submit( + process_single_episode, + dataset=dataset, + episode_index=episode_index, + output_dir=output_dir, + vcodec=vcodec, + pix_fmt=pix_fmt, + g=g, + crf=crf, + fast_decode=fast_decode, + fps=fps, + num_image_workers=4, # Use fixed workers for image saving within each episode + overwrite=overwrite, + ) + for episode_index in episode_indices + ] + + for future in tqdm( + as_completed(futures), + total=len(episode_indices), + desc="Episodes", + ): + result = future.result() # This will raise any exceptions that occurred + logging.info(result) + + logging.info(f"\n✓ Completed processing {dataset.repo_id}") + + +def handle_convert_to_video(cfg: EditDatasetConfig) -> None: + if not isinstance(cfg.operation, ConvertToVideoConfig): + raise ValueError("Operation config must be ConvertToVideoConfig") + + dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) + output_dir =
Path(cfg.operation.output_dir) + + logging.info(f"Converting dataset {cfg.repo_id} to video format") + convert_dataset_to_videos( + dataset=dataset, + output_dir=output_dir, + vcodec=cfg.operation.vcodec, + pix_fmt=cfg.operation.pix_fmt, + g=cfg.operation.g, + crf=cfg.operation.crf, + fast_decode=cfg.operation.fast_decode, + episode_indices=cfg.operation.episode_indices, + num_workers=cfg.operation.num_workers, + overwrite=cfg.operation.overwrite, + ) + + @parser.wrap() def edit_dataset(cfg: EditDatasetConfig) -> None: operation_type = cfg.operation.type @@ -270,10 +522,12 @@ def edit_dataset(cfg: EditDatasetConfig) -> None: handle_merge(cfg) elif operation_type == "remove_feature": handle_remove_feature(cfg) + elif operation_type == "convert_to_video": + handle_convert_to_video(cfg) else: raise ValueError( f"Unknown operation type: {operation_type}\n" - f"Available operations: delete_episodes, split, merge, remove_feature" + f"Available operations: delete_episodes, split, merge, remove_feature, convert_to_video" )