diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py index 95ff68e23..d6a87bce9 100644 --- a/src/lerobot/configs/video.py +++ b/src/lerobot/configs/video.py @@ -16,6 +16,8 @@ """Video encoder configurations.""" +from __future__ import annotations + import logging from dataclasses import dataclass, field from typing import Any @@ -35,6 +37,9 @@ HW_VIDEO_CODECS = [ "h264_qsv", # Intel Quick Sync ] VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}) +# Aliases for legacy video codec names. +VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"} + LIBSVTAV1_DEFAULT_PRESET: int = 12 @@ -88,6 +93,30 @@ class VideoEncoderConfig: self.preset = LIBSVTAV1_DEFAULT_PRESET self.validate() + @classmethod + def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig: + """Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block. + Missing or ``None`` values fall back to the class defaults. + """ + video_info = video_info or {} + kwargs: dict[str, Any] = {} + + for src_key, dst_field in (("video.codec", "vcodec"), ("video.pix_fmt", "pix_fmt")): + value = video_info.get(src_key) + if value is not None: + kwargs[dst_field] = value + + for field_name in VIDEO_ENCODER_INFO_FIELD_NAMES: + value = video_info.get(f"video.{field_name}") + if value is None: + continue + # Persisted as ``{}`` after merges with disagreeing sources — treat as default. + if field_name == "extra_options" and not value: + continue + kwargs[field_name] = value + + return cls(**kwargs) + def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: """Return the subset of available encoders based on the specified video backend. @@ -116,7 +145,11 @@ class VideoEncoderConfig: For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. 
If the resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``. + + Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are + rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``). """ + self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec) if self.vcodec not in VALID_VIDEO_CODECS: raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}") if self.vcodec == "auto": diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 438f3c0b4..489914fbc 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -96,17 +96,19 @@ def delete_episodes( episode_indices: list[int], output_dir: str | Path | None = None, repo_id: str | None = None, - camera_encoder: VideoEncoderConfig | None = None, ) -> LeRobotDataset: """Delete episodes from a LeRobotDataset and create a new dataset. + Video segments that need re-encoding (because the source file mixes kept and + deleted episodes) are re-encoded with the source dataset's existing encoder + settings — read back from ``meta/info.json`` — so the output dataset stays + consistent with its own metadata. + Args: dataset: The source LeRobotDataset. episode_indices: List of episode indices to delete. output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - camera_encoder: Video encoder settings used when re-encoding video segments - (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). 
""" if not episode_indices: raise ValueError("No episodes to delete") @@ -139,7 +141,7 @@ def delete_episodes( video_metadata = None if dataset.meta.video_keys: - video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder) + video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) @@ -161,17 +163,19 @@ def split_dataset( dataset: LeRobotDataset, splits: dict[str, float | list[int]], output_dir: str | Path | None = None, - camera_encoder: VideoEncoderConfig | None = None, ) -> dict[str, LeRobotDataset]: """Split a LeRobotDataset into multiple smaller datasets. + Video segments that need re-encoding (because the source file mixes episodes + that fall into different splits) are re-encoded with the source dataset's + existing encoder settings — read back from ``meta/info.json`` — so each + output split stays consistent with its own metadata. + Args: dataset: The source LeRobotDataset to split. splits: Either a dict mapping split names to episode indices, or a dict mapping split names to fractions (must sum to <= 1.0). output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. - camera_encoder: Video encoder settings used when re-encoding video segments - (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). 
Examples: Split by specific episodes @@ -232,7 +236,7 @@ def split_dataset( video_metadata = None if dataset.meta.video_keys: - video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder) + video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) @@ -588,7 +592,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - camera_encoder: VideoEncoderConfig | None = None, + camera_encoder: VideoEncoderConfig, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -602,11 +606,8 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - camera_encoder: Video encoder settings - (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). + camera_encoder: Video encoder settings used to re-encode the kept frames. """ - if camera_encoder is None: - camera_encoder = camera_encoder_defaults() from fractions import Fraction import av @@ -699,26 +700,23 @@ def _copy_and_reindex_videos( src_dataset: LeRobotDataset, dst_meta: LeRobotDatasetMetadata, episode_mapping: dict[int, int], - camera_encoder: VideoEncoderConfig | None = None, ) -> dict[int, dict]: """Copy and filter video files, only re-encoding files with deleted episodes. For video files that only contain kept episodes, we copy them directly. For files with mixed kept/deleted episodes, we use PyAV filters to efficiently - re-encode only the desired segments. + re-encode only the desired segments. The encoder used for re-encoding is + derived per video key from the source dataset's ``meta/info.json`` so the + destination metadata keeps describing the videos accurately. 
Args: src_dataset: Source dataset to copy from dst_meta: Destination metadata object episode_mapping: Mapping from old episode indices to new indices - camera_encoder: Video encoder settings used when re-encoding segments - (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). Returns: dict mapping episode index to its video metadata (chunk_index, file_index, timestamps) """ - if camera_encoder is None: - camera_encoder = camera_encoder_defaults() if src_dataset.meta.episodes is None: src_dataset.meta.episodes = load_episodes(src_dataset.meta.root) @@ -726,6 +724,9 @@ def _copy_and_reindex_videos( for video_key in src_dataset.meta.video_keys: logging.info(f"Processing videos for {video_key}") + camera_encoder = VideoEncoderConfig.from_video_info( + src_dataset.meta.info.features.get(video_key, {}).get("info") + ) if dst_meta.video_path is None: raise ValueError("Destination metadata has no video_path defined") diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index 9107a29c7..eb6a57870 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -49,14 +49,6 @@ Delete episodes and save to a new dataset at a specific path and with a new repo --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" -Delete episodes and re-encode video segments with h264: - lerobot-edit-dataset \ - --repo_id lerobot/pusht \ - --operation.type delete_episodes \ - --operation.episode_indices "[0, 2, 5]" \ - --operation.camera_encoder.vcodec h264 \ - --operation.camera_encoder.crf 23 - Split dataset by fractions (pusht_train, pusht_val): lerobot-edit-dataset \ --repo_id lerobot/pusht \ @@ -82,14 +74,6 @@ Split into more than two splits: --operation.type split \ --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}' -Split dataset and re-encode video segments with h264: - lerobot-edit-dataset \ - --repo_id lerobot/pusht \ - --operation.type split \ - 
--operation.splits '{"train": 0.8, "val": 0.2}' \ - --operation.camera_encoder.vcodec h264 \ - --operation.camera_encoder.crf 23 - Merge multiple datasets: lerobot-edit-dataset \ --new_repo_id lerobot/pusht_merged \ @@ -234,14 +218,12 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC): @dataclass class DeleteEpisodesConfig(OperationConfig): episode_indices: list[int] | None = None - camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) @OperationConfig.register_subclass("split") @dataclass class SplitConfig(OperationConfig): splits: dict[str, float | list[int]] | None = None - camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) @OperationConfig.register_subclass("merge") @@ -370,7 +352,6 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None: episode_indices=cfg.operation.episode_indices, output_dir=output_dir, repo_id=output_repo_id, - camera_encoder=cfg.operation.camera_encoder, ) logging.info(f"Dataset saved to {output_dir}") @@ -402,7 +383,6 @@ def handle_split(cfg: EditDatasetConfig) -> None: dataset, splits=cfg.operation.splits, output_dir=cfg.new_root, - camera_encoder=cfg.operation.camera_encoder, ) for split_name, split_ds in split_datasets.items(): diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py index a9529d1d8..9c2fbf87f 100644 --- a/tests/datasets/test_video_encoding.py +++ b/tests/datasets/test_video_encoding.py @@ -36,6 +36,7 @@ from lerobot.datasets.video_utils import ( encode_video_frames, get_video_info, ) +from tests.fixtures.constants import DUMMY_VIDEO_INFO # Per-codec skip markers — validation tests only fire when the codec is available @@ -570,3 +571,25 @@ class TestEncoderConfigPersistence: dataset.finalize() assert _read_feature_info(dataset) == first_info + + +class TestFromVideoInfo: + """``VideoEncoderConfig.from_video_info`` reconstructs an encoder config + from the ``video.*`` keys persisted in a dataset's 
``info.json``. + """ + + @require_libsvtav1 + def test_reconstructs_from_dummy_video_info(self): + cfg = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO) + + # Canonical stream codec ``"av1"`` is aliased to the encoder name. + assert cfg.vcodec == "libsvtav1" + assert cfg.pix_fmt == DUMMY_VIDEO_INFO["video.pix_fmt"] + assert cfg.g == DUMMY_VIDEO_INFO["video.g"] + assert cfg.crf == DUMMY_VIDEO_INFO["video.crf"] + assert cfg.preset == DUMMY_VIDEO_INFO["video.preset"] + assert cfg.fast_decode == DUMMY_VIDEO_INFO["video.fast_decode"] + assert cfg.video_backend == DUMMY_VIDEO_INFO["video.video_backend"] + # ``{}`` placeholder (typical after a merge with disagreeing sources) + # must not leak into the reconstructed config. + assert cfg.extra_options == VideoEncoderConfig().extra_options