diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py index ab74c3cd3..c3fe246cd 100644 --- a/src/lerobot/configs/__init__.py +++ b/src/lerobot/configs/__init__.py @@ -31,6 +31,12 @@ from .types import ( PolicyFeature, RTCAttentionSchedule, ) +from .video import ( + VALID_VIDEO_CODECS, + VIDEO_ENCODER_INFO_KEYS, + VideoEncoderConfig, + camera_encoder_defaults, +) __all__ = [ # Types @@ -46,4 +52,10 @@ __all__ = [ "PeftConfig", "PreTrainedConfig", "WandBConfig", + "VideoEncoderConfig", + # Defaults + "camera_encoder_defaults", + # Constants + "VALID_VIDEO_CODECS", + "VIDEO_ENCODER_INFO_KEYS", ] diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py index 406488a7e..24154eecb 100644 --- a/src/lerobot/configs/dataset.py +++ b/src/lerobot/configs/dataset.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from lerobot.datasets.video_utils import VideoEncoderConfig, camera_encoder_defaults +from .video import VideoEncoderConfig, camera_encoder_defaults @dataclass diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py new file mode 100644 index 000000000..292225ec4 --- /dev/null +++ b/src/lerobot/configs/video.py @@ -0,0 +1,202 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Note: We subclass str so that serialization is straightforward +# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json + +"""Video encoder configurations.""" + +import logging +from dataclasses import dataclass, field +from typing import Any + +from lerobot.utils.import_utils import require_package + +logger = logging.getLogger(__name__) + +# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend. +# Determines the order of preference for auto-selection when vcodec="auto" is used. +HW_VIDEO_CODECS = [ + "h264_videotoolbox", # macOS + "hevc_videotoolbox", # macOS + "h264_nvenc", # NVIDIA GPU + "hevc_nvenc", # NVIDIA GPU + "h264_vaapi", # Linux Intel/AMD + "h264_qsv", # Intel Quick Sync +] +VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}) + +LIBSVTAV1_DEFAULT_PRESET: int = 12 + +# Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`). +# ``vcodec`` and ``pix_fmt`` are derived from the video stream directly. +VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset( + {"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"} +) +VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset( + f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES +) + + +@dataclass +class VideoEncoderConfig: + """Video encoder configuration. + + Attributes: + vcodec: Video encoder name. ``"auto"`` is resolved during + construction (HW encoder if available, else ``libsvtav1``). + pix_fmt: Pixel format (e.g. ``"yuv420p"``). + g: GOP size (keyframe interval). + crf: Quality level — mapped to the native quality parameter of the + codec (``crf`` for software, ``qp`` for NVENC/VAAPI, + ``q:v`` for VideoToolbox, ``global_quality`` for QSV). + preset: Speed/quality preset. Accepted type is per-codec. + fast_decode: Fast-decode tuning. 
For ``libsvtav1`` this is a level (0-2) + embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values + set ``tune=fastdecode``. Ignored for other codecs. + video_backend: Python library to be used for encoding. Only ``"pyav"`` + is currently supported. + extra_options: Free-form dictionary of additional video encoder options + (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). + """ + + vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ? + pix_fmt: str = "yuv420p" + g: int | None = 2 + crf: int | None = 30 + preset: int | str | None = None + fast_decode: int = 0 + # TODO(CarolinePascal): add torchcodec support + find a way to unify the + # two backends (encoding and decoding). + video_backend: str = "pyav" + extra_options: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + self.resolve_vcodec() + # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work". + if self.preset is None and self.vcodec == "libsvtav1": + self.preset = LIBSVTAV1_DEFAULT_PRESET + self.validate() + + def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: + """Return the subset of available encoders based on the specified video backend. + + Args: + encoders: List of encoder names to detect. If a string, it is converted to a list. + Returns: + List of available encoder names. If the video backend is not "pyav", returns an empty list. 
+ """ + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets.pyav_utils import detect_available_encoders_pyav + + return detect_available_encoders_pyav(encoders) + return [] + + def validate(self) -> None: + """Validate the video encoder configuration.""" + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets.pyav_utils import check_video_encoder_config_pyav + + check_video_encoder_config_pyav(self) + + def resolve_vcodec(self) -> None: + """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder. + + For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the + resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``. + """ + if self.vcodec not in VALID_VIDEO_CODECS: + raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}") + if self.vcodec == "auto": + available = self.detect_available_encoders(HW_VIDEO_CODECS) + for encoder in HW_VIDEO_CODECS: + if encoder in available: + logger.info(f"Auto-selected video codec: {encoder}") + self.vcodec = encoder + return + logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'") + self.vcodec = "libsvtav1" + + if self.detect_available_encoders(self.vcodec): + logger.info(f"Using video codec: {self.vcodec}") + return + raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}") + + def get_codec_options( + self, encoder_threads: int | None = None, as_strings: bool = False + ) -> dict[str, Any]: + """Translate the tuning fields to codec-specific options. + + ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field. + + Args: + encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. 
+ For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. + For h264/hevc, this is mapped to ``threads``. + Hardware encoders ignore this parameter. + as_strings: If ``True``, casts values to strings. + """ + opts: dict[str, Any] = {} + + def set_if(key: str, value: Any) -> None: + if value is not None: + opts[key] = value if not as_strings else str(value) + + # GOP size is not a codec-specific option, so it is always set. + set_if("g", self.g) + + if self.vcodec == "libsvtav1": + set_if("crf", self.crf) + set_if("preset", self.preset) + svtav1_parts: list[str] = [] + if self.fast_decode is not None: + svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}") + if encoder_threads is not None: + svtav1_parts.append(f"lp={encoder_threads}") + if svtav1_parts: + opts["svtav1-params"] = ":".join(svtav1_parts) + elif self.vcodec in ("h264", "hevc"): + set_if("crf", self.crf) + set_if("preset", self.preset) + if self.fast_decode: + opts["tune"] = "fastdecode" + set_if("threads", encoder_threads) + elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): + if self.crf is not None: + opts["q:v"] = max(1, min(100, 100 - self.crf * 2)) + elif self.vcodec in ("h264_nvenc", "hevc_nvenc"): + opts["rc"] = "constqp" + set_if("qp", self.crf) + set_if("preset", self.preset) + elif self.vcodec == "h264_vaapi": + set_if("qp", self.crf) + elif self.vcodec == "h264_qsv": + set_if("global_quality", self.crf) + set_if("preset", self.preset) + else: + set_if("crf", self.crf) + set_if("preset", self.preset) + + # Extra options are merged last but never override structured fields (values are kept as given). 
+ for k, v in self.extra_options.items(): + if k not in opts: + set_if(k, v) + + return opts + + +def camera_encoder_defaults() -> VideoEncoderConfig: + """Return a :class:`VideoEncoderConfig` with RGB-camera defaults.""" + return VideoEncoderConfig() diff --git a/src/lerobot/datasets/__init__.py b/src/lerobot/datasets/__init__.py index d916ec7bb..ae2b00205 100644 --- a/src/lerobot/datasets/__init__.py +++ b/src/lerobot/datasets/__init__.py @@ -40,19 +40,9 @@ from .io_utils import load_episodes, write_stats from .lerobot_dataset import LeRobotDataset from .multi_dataset import MultiLeRobotDataset from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features -from .pyav_utils import ( - check_video_encoder_config_pyav, - detect_available_encoders_pyav, - get_codec, -) from .sampler import EpisodeAwareSampler from .streaming_dataset import StreamingLeRobotDataset from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card -from .video_utils import ( - VideoEncoderConfig, - VideoEncodingManager, - camera_encoder_defaults, -) # NOTE: Low-level I/O functions (cast_stats_to_numpy, get_parquet_file_size_in_mb, etc.) # and legacy migration constants are intentionally NOT re-exported here. 
@@ -67,20 +57,14 @@ __all__ = [ "LeRobotDatasetMetadata", "MultiLeRobotDataset", "StreamingLeRobotDataset", - "VideoEncoderConfig", - "VideoEncodingManager", - "camera_encoder_defaults", "add_features", "aggregate_datasets", "aggregate_pipeline_dataset_features", "aggregate_stats", - "check_video_encoder_config_pyav", "convert_image_to_video_dataset", "create_initial_features", "create_lerobot_dataset_card", "delete_episodes", - "detect_available_encoders_pyav", - "get_codec", "get_feature_stats", "load_episodes", "make_dataset", diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py index a4ad60e5e..7adeb8c1a 100644 --- a/src/lerobot/datasets/pyav_utils.py +++ b/src/lerobot/datasets/pyav_utils.py @@ -23,12 +23,11 @@ from __future__ import annotations import functools import logging -from typing import TYPE_CHECKING, Any +from typing import Any import av -if TYPE_CHECKING: - from .video_utils import VideoEncoderConfig +from lerobot.configs.video import VideoEncoderConfig logger = logging.getLogger(__name__) @@ -167,7 +166,7 @@ def check_video_encoder_config_pyav(config: VideoEncoderConfig) -> None: """Verify *config* is compatible with the bundled FFmpeg build. Checks pixel format, abstract tuning-field compatibility, and each merged - encoder option from :meth:`~lerobot.datasets.video_utils.VideoEncoderConfig.get_codec_options` + encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options` against PyAV (including numeric ``extra_options`` present in that dict). No-op when ``config.vcodec`` isn't in the local FFmpeg build. 
diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 610609613..5bebd8cf1 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -36,177 +36,14 @@ import torch from datasets.features.features import register_feature from PIL import Image +from lerobot.configs.video import ( + VideoEncoderConfig, + camera_encoder_defaults, +) from lerobot.utils.import_utils import get_safe_default_video_backend -from .pyav_utils import ( - check_video_encoder_config_pyav, - detect_available_encoders_pyav, -) - logger = logging.getLogger(__name__) -# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build. -# Determines the order of preference for auto-selection when vcodec="auto" is used. -HW_VIDEO_CODECS = [ - "h264_videotoolbox", # macOS - "hevc_videotoolbox", # macOS - "h264_nvenc", # NVIDIA GPU - "hevc_nvenc", # NVIDIA GPU - "h264_vaapi", # Linux Intel/AMD - "h264_qsv", # Intel Quick Sync -] - -VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_VIDEO_CODECS) - -LIBSVTAV1_DEFAULT_PRESET: int = 12 - - -@dataclass -class VideoEncoderConfig: - """Video encoder configuration. - - Attributes: - vcodec: FFmpeg encoder name. ``"auto"`` is resolved during - construction (HW encoder if available, else ``libsvtav1``). - pix_fmt: Pixel format (e.g. ``"yuv420p"``). - g: GOP size (keyframe interval). - crf: Quality level — mapped to the native quality parameter of the - codec (``crf`` for software, ``qp`` for NVENC/VAAPI, - ``q:v`` for VideoToolbox, ``global_quality`` for QSV). - preset: Speed/quality preset. Accepted type is per-codec. - fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2) - embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values - set ``tune=fastdecode``. Ignored for other codecs. - video_backend: Python library driving FFmpeg for encoding. Only ``"pyav"`` - is currently supported. 
- extra_options: Free-form dictionary of additional FFmpeg options - (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). - """ - - vcodec: str = "libsvtav1" - pix_fmt: str = "yuv420p" - g: int | None = 2 - crf: int | None = 30 - preset: int | str | None = None - fast_decode: int = 0 - # TODO(CarolinePascal): add torchcodec support + find a way to unify the - # two backends (encoding and decoding). - video_backend: str = "pyav" - extra_options: dict[str, Any] = field(default_factory=dict) - - def __post_init__(self) -> None: - self.resolve_vcodec() - - # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work". - if self.preset is None and self.vcodec == "libsvtav1": - self.preset = LIBSVTAV1_DEFAULT_PRESET - - self.validate() - - def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: - """Detect available encoders based on the video backend.""" - if self.video_backend == "pyav": - return detect_available_encoders_pyav(encoders) - else: - return [] - - def validate(self) -> None: - """Validate the video encoder config.""" - if self.video_backend == "pyav": - check_video_encoder_config_pyav(self) - - def resolve_vcodec(self) -> None: - """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder. - - For ``"auto"``, the first hardware encoder in the preference list that FFmpeg - exposes is chosen; if none are available, ``libsvtav1`` is used. If the - resolved codec (explicit or after auto-selection) is not present in the - local FFmpeg build, raises ``ValueError``. - """ - if self.vcodec not in VALID_VIDEO_CODECS: - raise ValueError(f"Invalid vcodec '{self.vcodec}'. 
Must be one of: {sorted(VALID_VIDEO_CODECS)}") - if self.vcodec == "auto": - available = self.detect_available_encoders(HW_VIDEO_CODECS) - for encoder in HW_VIDEO_CODECS: - if encoder in available: - logger.info(f"Auto-selected video codec: {encoder}") - self.vcodec = encoder - return - logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'") - self.vcodec = "libsvtav1" - - if self.detect_available_encoders(self.vcodec): - logger.info(f"Using video codec: {self.vcodec}") - return - raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}") - - def get_codec_options( - self, encoder_threads: int | None = None, as_strings: bool = False - ) -> dict[str, Any]: - """Translate the tuning fields to codec-specific FFmpeg options. - - ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field. - - Args: - encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. - For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. - For h264/hevc, this is mapped to ``threads``. - Hardware encoders ignore this parameter. - as_strings: If ``True``, casts values to strings. - """ - opts: dict[str, Any] = {} - - def set_if(key: str, value: Any) -> None: - if value is not None: - opts[key] = value if not as_strings else str(value) - - # GOP size is not a codec-specific option, so it is always set. 
- set_if("g", self.g) - - if self.vcodec == "libsvtav1": - set_if("crf", self.crf) - set_if("preset", self.preset) - svtav1_parts: list[str] = [] - if self.fast_decode is not None: - svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}") - if encoder_threads is not None: - svtav1_parts.append(f"lp={encoder_threads}") - if svtav1_parts: - opts["svtav1-params"] = ":".join(svtav1_parts) - elif self.vcodec in ("h264", "hevc"): - set_if("crf", self.crf) - set_if("preset", self.preset) - if self.fast_decode: - opts["tune"] = "fastdecode" - set_if("threads", encoder_threads) - elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): - if self.crf is not None: - opts["q:v"] = max(1, min(100, 100 - self.crf * 2)) - elif self.vcodec in ("h264_nvenc", "hevc_nvenc"): - opts["rc"] = "constqp" - set_if("qp", self.crf) - set_if("preset", self.preset) - elif self.vcodec == "h264_vaapi": - set_if("qp", self.crf) - elif self.vcodec == "h264_qsv": - set_if("global_quality", self.crf) - set_if("preset", self.preset) - else: - set_if("crf", self.crf) - set_if("preset", self.preset) - - # Extra options are merged last but never override structured fields (values are kept as given). 
- for k, v in self.extra_options.items(): - if k not in opts: - set_if(k, v) - - return opts - - -def camera_encoder_defaults() -> VideoEncoderConfig: - """Return a :class:`VideoEncoderConfig` with RGB-camera defaults.""" - return VideoEncoderConfig() - def decode_video_frames( video_path: Path | str, diff --git a/src/lerobot/utils/import_utils.py b/src/lerobot/utils/import_utils.py index a7e8c8b59..ef03367eb 100644 --- a/src/lerobot/utils/import_utils.py +++ b/src/lerobot/utils/import_utils.py @@ -128,6 +128,9 @@ _hidapi_available = is_package_available("hidapi", import_name="hid") _pandas_available = is_package_available("pandas") _faker_available = is_package_available("faker") +# Video encoding / decoding +_av_available = is_package_available("av") + # Misc _pynput_available = is_package_available("pynput") _pygame_available = is_package_available("pygame") diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 90a1aa5dc..36aa3f6f3 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -31,6 +31,7 @@ from torchvision.transforms import v2 from lerobot.configs.default import DatasetConfig from lerobot.configs.train import TrainPipelineConfig +from lerobot.configs.video import VALID_VIDEO_CODECS, VideoEncoderConfig from lerobot.datasets import make_dataset from lerobot.datasets.feature_utils import get_hf_features_from_features from lerobot.datasets.image_writer import image_array_to_pil_image @@ -43,7 +44,6 @@ from lerobot.datasets.utils import ( DEFAULT_VIDEO_FILE_SIZE_IN_MB, create_branch, ) -from lerobot.datasets.video_utils import VALID_VIDEO_CODECS, VideoEncoderConfig from lerobot.envs.factory import make_env_config from lerobot.policies.factory import make_policy_config from lerobot.robots import make_robot_from_config diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py index 7332ab357..3ac4269fc 100644 --- a/tests/datasets/test_video_encoding.py +++ 
b/tests/datasets/test_video_encoding.py @@ -26,13 +26,12 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])") import av # noqa: E402 +from lerobot.configs.video import VALID_VIDEO_CODECS, VideoEncoderConfig from lerobot.datasets.image_writer import write_image from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.pyav_utils import get_codec from lerobot.datasets.utils import INFO_PATH from lerobot.datasets.video_utils import ( - VALID_VIDEO_CODECS, - VideoEncoderConfig, concatenate_video_files, encode_video_frames, get_video_info,