add video encoding tool

2026-07-23 01:41:54 +00:00 · 2025-12-01 13:46:22 +01:00
parent 5f7b5f2817
commit d22fa6446b
2 changed files with 307 additions and 5 deletions
@@ -11,13 +11,14 @@ LeRobot provides several utilities for manipulating datasets:
 3. **Merge Datasets** - Combine multiple datasets into one. The datasets must have identical features, and episodes are concatenated in the order specified in `repo_ids`
 4. **Add Features** - Add new features to a dataset
 5. **Remove Features** - Remove features from a dataset
 6. **Convert to Video** - Convert image-based datasets to video format for efficient storage
 The core implementation is in `lerobot.datasets.dataset_tools`.
 An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`.
 ## Command-Line Tool: lerobot-edit-dataset
-`lerobot-edit-dataset` is a command-line script for editing datasets. It can be used to delete episodes, split datasets, merge datasets, add features, and remove features.
+`lerobot-edit-dataset` is a command-line script for editing datasets. It can be used to delete episodes, split datasets, merge datasets, add features, remove features, and convert image datasets to video format.
 Run `lerobot-edit-dataset --help` for more information on the configuration of each operation.
@@ -86,6 +87,53 @@ lerobot-edit-dataset \
    --operation.feature_names "['observation.images.top']"
 ```
 #### Convert to Video
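 Convert an image-based dataset to video format. This is useful for reducing storage requirements and improving data loading performance. Videos are encoded with configurable quality settings. One MP4 file is written per episode, and only the first image feature (camera) of the dataset is encoded.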
 ```bash
 # Convert all episodes to video format with default settings
 lerobot-edit-dataset \
    --repo_id lerobot/pusht_image \
    --operation.type convert_to_video \
    --operation.output_dir outputs/converted_videos
 # Convert with custom video codec and quality settings
 lerobot-edit-dataset \
    --repo_id lerobot/pusht_image \
    --operation.type convert_to_video \
    --operation.output_dir outputs/converted_videos \
    --operation.vcodec libsvtav1 \
    --operation.pix_fmt yuv420p \
    --operation.g 2 \
    --operation.crf 30
 # Convert only specific episodes
 lerobot-edit-dataset \
    --repo_id lerobot/pusht_image \
    --operation.type convert_to_video \
    --operation.output_dir outputs/converted_videos \
    --operation.episode_indices "[0, 1, 2, 5, 10]"
 # Convert with multiple workers for parallel processing
 lerobot-edit-dataset \
    --repo_id lerobot/pusht_image \
    --operation.type convert_to_video \
    --operation.output_dir outputs/converted_videos \
    --operation.num_workers 8
 ```
 **Parameters:**
 - `output_dir`: Directory where the extracted frames (under `images/`) and the encoded videos (under `videos/`) will be saved (default: `outputs/converted_videos`)
 - `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`)
 - `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`)
 - `g`: Group of pictures (GOP) size, i.e. the maximum distance between keyframes - lower values make random frame access faster but produce larger files (default: 2)
 - `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30)
 - `fast_decode`: Fast decode tuning option (default: 0)
 - `episode_indices`: List of specific episodes to convert (default: all episodes)
 - `num_workers`: Number of parallel workers for processing (default: 4)
 - `overwrite`: Overwrite existing image and video files if they exist (default: `False`)
 ### Push to Hub
 Add the `--push_to_hub` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub:
@@ -18,7 +18,8 @@
 Edit LeRobot datasets using various transformation tools.
 This script allows you to delete episodes, split datasets, merge datasets,
-and remove features. When new_repo_id is specified, creates a new dataset.
+remove features, and convert image datasets to video format. 
 When new_repo_id is specified, creates a new dataset.
 Usage Examples:
@@ -65,6 +66,14 @@ Remove camera feature:
        --operation.type remove_feature \
        --operation.feature_names "['observation.images.top']"
 Convert image dataset to video format:
    python -m lerobot.scripts.lerobot_edit_dataset \
        --repo_id lerobot/pusht_image \
        --operation.type convert_to_video \
        --operation.output_dir outputs/converted_videos \
        --operation.vcodec libsvtav1 \
        --operation.crf 30
 Using JSON config file:
    python -m lerobot.scripts.lerobot_edit_dataset \
        --config_path path/to/edit_config.json
@@ -72,9 +81,12 @@ Using JSON config file:
 import logging
 import shutil
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from tqdm import tqdm
 from lerobot.configs import parser
 from lerobot.datasets.dataset_tools import (
    delete_episodes,
@@ -83,7 +95,8 @@ from lerobot.datasets.dataset_tools import (
    split_dataset,
 )
 from lerobot.datasets.lerobot_dataset import LeRobotDataset
-from lerobot.utils.constants import HF_LEROBOT_HOME
+from lerobot.datasets.video_utils import encode_video_frames
 from lerobot.utils.constants import HF_LEROBOT_HOME, OBS_IMAGE
 from lerobot.utils.utils import init_logging
@@ -111,10 +124,24 @@ class RemoveFeatureConfig:
    feature_names: list[str] | None = None
@dataclass
 class ConvertToVideoConfig:
    """Options of the `convert_to_video` operation of the dataset editing script."""
    # Operation discriminator; the dispatcher compares it to "convert_to_video".
    type: str = "convert_to_video"
    # Base output directory: frames go under `images/`, videos under `videos/`.
    output_dir: str = "outputs/converted_videos"
    # Encoder settings, forwarded unchanged to `encode_video_frames`.
    vcodec: str = "libsvtav1"
    pix_fmt: str = "yuv420p"
    g: int = 2
    crf: int = 30
    fast_decode: int = 0
    # Episodes to convert; None means every episode of the dataset.
    episode_indices: list[int] | None = None
    # Number of threads used to process episodes in parallel.
    num_workers: int = 4
    # When False, existing frames and videos are reused instead of regenerated.
    overwrite: bool = False
@dataclass
 class EditDatasetConfig:
    repo_id: str
-    operation: DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig
+    operation: DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig | ConvertToVideoConfig
    root: str | None = None
    new_repo_id: str | None = None
    push_to_hub: bool = False
@@ -258,6 +285,231 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None:
        LeRobotDataset(output_repo_id, root=output_dir).push_to_hub()
 def save_episode_images(
    dataset: LeRobotDataset,
    imgs_dir: Path,
    episode_index: int = 0,
    overwrite: bool = False,
    num_workers: int = 4,
 ) -> None:
    """Write the frames of one episode to disk as PNG files.

    Only the first image feature (first key starting with ``OBS_IMAGE``) is
    exported. Frames are named ``frame-XXXXXX.png``, numbered from 0 within the
    episode, which is the naming `encode_video_frames` is fed with afterwards.

    Args:
        dataset: The LeRobot dataset to extract images from
        imgs_dir: Directory to save images to
        episode_index: Index of the episode to save (default: 0)
        overwrite: Whether to overwrite existing images
        num_workers: Number of threads for parallel image saving (default: 4)

    Raises:
        ValueError: If the dataset has no image feature.
    """
    ep_num_images = dataset.meta.episodes["length"][episode_index]

    # A complete set of frames left by a previous run is reused as-is.
    if not overwrite and imgs_dir.exists() and len(list(imgs_dir.glob("frame-*.png"))) == ep_num_images:
        logging.info(f"Images for episode {episode_index} already exist in {imgs_dir}. Skipping.")
        return

    # Drop the torch formatting to get PIL image access.
    hf_dataset = dataset.hf_dataset.with_format(None)

    # Validate before touching the filesystem so a failure leaves no empty directory behind.
    img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)]
    if not img_keys:
        raise ValueError(f"No image keys found in dataset {dataset.repo_id}")

    # Use first camera only
    img_key = img_keys[0]

    imgs_dir.mkdir(parents=True, exist_ok=True)

    # Restrict the dataset to the image column and to the rows of this episode.
    from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
    to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
    episode_dataset = hf_dataset.select_columns(img_key).select(range(from_idx, to_idx))

    def save_single_image(frame_index, img):
        # PNG is lossless, so no `quality` option is passed to `save`.
        img.save(str(imgs_dir / f"frame-{frame_index:06d}.png"))
        return frame_index

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit frames while iterating so that writing overlaps with decoding and
        # decoded frames are not all kept alive in an intermediate list.
        futures = [
            executor.submit(save_single_image, frame_index, item[img_key])
            for frame_index, item in enumerate(episode_dataset)
        ]
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc=f"Saving {dataset.repo_id} episode {episode_index} images",
            leave=False,
        ):
            future.result()  # Re-raise any exception that occurred in a worker
 def process_single_episode(
    dataset: LeRobotDataset,
    episode_index: int,
    output_dir: Path,
    vcodec: str,
    pix_fmt: str,
    g: int | None,
    crf: int | None,
    fast_decode: int,
    fps: int,
    num_image_workers: int,
    overwrite: bool,
 ) -> str:
    """Process a single episode: save images and encode to video.

    When the target video already exists and `overwrite` is False, the episode
    is skipped entirely, so no frames are extracted for it.

    Args:
        dataset: The LeRobot dataset
        episode_index: Index of the episode to process
        output_dir: Base directory for outputs
        vcodec: Video codec
        pix_fmt: Pixel format
        g: Group of pictures size
        crf: Constant rate factor
        fast_decode: Fast decode tuning
        fps: Frames per second
        num_image_workers: Number of threads for parallel image saving
        overwrite: Whether to overwrite existing files

    Returns:
        Status message for this episode
    """
    # The repo id is used as a path component, so its "/" is flattened.
    repo_slug = dataset.repo_id.replace("/", "_")
    imgs_dir = output_dir / "images" / repo_slug / f"episode_{episode_index:06d}"

    # The encoding parameters are part of the file name, so different settings do not collide.
    video_filename = f"{repo_slug}_ep{episode_index:06d}_{vcodec}_{pix_fmt}_g{g}_crf{crf}.mp4"
    video_path = output_dir / "videos" / repo_slug / video_filename

    # Check the final artifact first: extracting frames for an episode whose
    # video is already there would be wasted work.
    if not overwrite and video_path.is_file():
        return f"Video already exists: {video_path}. Skipping."

    save_episode_images(dataset, imgs_dir, episode_index, overwrite, num_image_workers)

    video_path.parent.mkdir(parents=True, exist_ok=True)
    encode_video_frames(
        imgs_dir=imgs_dir,
        video_path=video_path,
        fps=fps,
        vcodec=vcodec,
        pix_fmt=pix_fmt,
        g=g,
        crf=crf,
        fast_decode=fast_decode,
        overwrite=True,
    )
    return f"✓ Video saved to {video_path}"
 def convert_dataset_to_videos(
    dataset: LeRobotDataset,
    output_dir: Path,
    vcodec: str = "libsvtav1",
    pix_fmt: str = "yuv420p",
    g: int = 2,
    crf: int = 30,
    fast_decode: int = 0,
    episode_indices: list[int] | None = None,
    num_workers: int = 4,
    overwrite: bool = False,
    num_image_workers: int = 4,
 ) -> None:
    """Convert dataset images to video files.

    Args:
        dataset: The LeRobot dataset
        output_dir: Base directory for outputs
        vcodec: Video codec (default: libsvtav1)
        pix_fmt: Pixel format (default: yuv420p)
        g: Group of pictures size (default: 2)
        crf: Constant rate factor (default: 30)
        fast_decode: Fast decode tuning (default: 0)
        episode_indices: List of episode indices to convert (None = all episodes).
            Duplicates are ignored.
        num_workers: Number of threads for parallel episode processing (default: 4)
        overwrite: Whether to overwrite existing files
        num_image_workers: Number of threads used to save the images of each
            episode (default: 4)

    Raises:
        ValueError: If the dataset already contains video features, or if an
            episode index is outside ``[0, number of episodes)``.
    """
    # Check that it's an image dataset
    if len(dataset.meta.video_keys) > 0:
        raise ValueError(
            f"This operation is for image datasets only. Video dataset provided: {dataset.repo_id}"
        )

    fps = dataset.fps

    # Determine which episodes to process
    num_episodes = len(dataset.meta.episodes)
    if episode_indices is None:
        episode_indices = list(range(num_episodes))
    else:
        # Two workers handling the same episode would write the same files
        # concurrently, so duplicates are dropped (first occurrence wins).
        episode_indices = list(dict.fromkeys(episode_indices))
        # Fail up front with a clear message instead of deep inside a worker thread.
        invalid = [idx for idx in episode_indices if not 0 <= idx < num_episodes]
        if invalid:
            raise ValueError(
                f"Invalid episode indices {invalid}: dataset {dataset.repo_id} has {num_episodes} episodes"
            )

    logging.info(f"Processing {len(episode_indices)} episodes from {dataset.repo_id} with {num_workers} workers")

    # Process episodes in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(
                process_single_episode,
                dataset=dataset,
                episode_index=episode_index,
                output_dir=output_dir,
                vcodec=vcodec,
                pix_fmt=pix_fmt,
                g=g,
                crf=crf,
                fast_decode=fast_decode,
                fps=fps,
                num_image_workers=num_image_workers,
                overwrite=overwrite,
            )
            for episode_index in episode_indices
        ]
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Episodes",
        ):
            result = future.result()  # Re-raise any exception that occurred in a worker
            logging.info(result)

    logging.info(f"\n✓ Completed processing {dataset.repo_id}")
 def handle_convert_to_video(cfg: EditDatasetConfig) -> None:
    """Run the `convert_to_video` operation described by `cfg`.

    Loads the dataset `cfg.repo_id` (from `cfg.root`) and writes the extracted
    frames and encoded videos below `cfg.operation.output_dir`. Nothing else is
    produced, so `cfg.new_repo_id` and `cfg.push_to_hub` have no effect here.

    Args:
        cfg: Script configuration whose `operation` is a `ConvertToVideoConfig`.

    Raises:
        ValueError: If `cfg.operation` is not a `ConvertToVideoConfig`.
    """
    if not isinstance(cfg.operation, ConvertToVideoConfig):
        raise ValueError("Operation config must be ConvertToVideoConfig")

    # Tell the user explicitly instead of silently dropping these options.
    if cfg.new_repo_id is not None or cfg.push_to_hub:
        logging.warning(
            "convert_to_video only writes local image and video files; "
            "`new_repo_id` and `push_to_hub` are ignored for this operation."
        )

    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
    output_dir = Path(cfg.operation.output_dir)

    logging.info(f"Converting dataset {cfg.repo_id} to video format")

    operation = cfg.operation
    convert_dataset_to_videos(
        dataset=dataset,
        output_dir=output_dir,
        vcodec=operation.vcodec,
        pix_fmt=operation.pix_fmt,
        g=operation.g,
        crf=operation.crf,
        fast_decode=operation.fast_decode,
        episode_indices=operation.episode_indices,
        num_workers=operation.num_workers,
        overwrite=operation.overwrite,
    )
@parser.wrap()
 def edit_dataset(cfg: EditDatasetConfig) -> None:
    operation_type = cfg.operation.type
@@ -270,10 +522,12 @@ def edit_dataset(cfg: EditDatasetConfig) -> None:
        handle_merge(cfg)
    elif operation_type == "remove_feature":
        handle_remove_feature(cfg)
    elif operation_type == "convert_to_video":
        handle_convert_to_video(cfg)
    else:
        raise ValueError(
            f"Unknown operation type: {operation_type}\n"
-            f"Available operations: delete_episodes, split, merge, remove_feature"
+            f"Available operations: delete_episodes, split, merge, remove_feature, convert_to_video"
        )