diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py index ca9dc12f5..4f18b5dec 100644 --- a/src/lerobot/datasets/pyav_utils.py +++ b/src/lerobot/datasets/pyav_utils.py @@ -33,21 +33,7 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE") - -# Abstract tuning fields of VideoEncoderConfig -TUNING_FIELDS: tuple[str, ...] = ("g", "crf", "preset", "fast_decode") - -# Codec-specific FFmpeg private option whose value is controlled by the -# abstract ``crf`` tuning field. -CRF_OPTION_BY_CODEC: dict[str, str] = { - "libsvtav1": "crf", - "h264": "crf", - "hevc": "crf", - "h264_nvenc": "qp", - "hevc_nvenc": "qp", - "h264_vaapi": "qp", - "h264_qsv": "global_quality", -} +FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64") @functools.cache @@ -95,85 +81,67 @@ def detect_available_encoders_pyav(encoders: list[str] | str) -> list[str]: return available -def _is_field_supported(field_name: str, vcodec: str, options: dict[str, av.option.Option]) -> bool: - """Whether tuning option *field_name* is meaningful for *vcodec*.""" - # GOP is a stream-level option (AVStream.gop_size) not stored in private options. - # Every video codec accepts it. - if field_name == "g": - return True - if field_name == "crf": - # Semantic "crf" maps to the codec's private option (see - # CRF_OPTION_BY_CODEC), or to stream-level q:v for VideoToolbox. - opt_name = CRF_OPTION_BY_CODEC.get(vcodec) - return (opt_name is not None and opt_name in options) or vcodec in { - "h264_videotoolbox", - "hevc_videotoolbox", - } - if field_name == "fast_decode": - # libsvtav1: svtav1-params:fast-decode=N — h264/hevc: tune=fastdecode. - return "svtav1-params" in options or "tune" in options - # preset and any future private-option-backed field: direct membership test. - return field_name in options - - -def _check_numeric_range(label: str, num: float, opt: av.option.Option, vcodec: str) -> None: - """Raise if *num* lies outside *opt*'s numeric range (no-op if range is degenerate).""" - lo, hi = float(opt.min), float(opt.max) - if lo < hi and not (lo <= num <= hi): - raise ValueError(f"{label}={num} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]") - - -def _validate_option_value(vcodec: str, field_name: str, value: Any, opt: av.option.Option) -> None: - """Range-check numeric *value* and choice-check string *value* against *opt*. - - Type mismatches fall through to FFmpeg's own validation at encode time. - """ +def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Option) -> None: + """Range-check numeric *value* and choice-check string *value* against *opt*.""" type_name = opt.type.name if type_name in FFMPEG_NUMERIC_OPTION_TYPES: - if isinstance(value, bool) or not isinstance(value, (int, float)): - return - _check_numeric_range(field_name, float(value), opt, vcodec) - elif type_name == "STRING": - if not isinstance(value, str): - return - choices = [c.name for c in (opt.choices or [])] - if choices and value not in choices: + if isinstance(value, bool): raise ValueError( - f"{field_name}={value!r} is not a supported choice for codec " + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + elif isinstance(value, str): + try: + num_val = float(value) + except ValueError as e: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) from e + elif isinstance(value, (float, int)): + num_val = value + else: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + + # Check integer type compatibility + if type_name in FFMPEG_INTEGER_OPTION_TYPES and not num_val.is_integer(): + raise ValueError( + f"{label}={num_val!r} must be an integer for codec {vcodec!r} " + f"(FFmpeg option {opt.name!r} is {type_name}); float values are not allowed." + ) + + # Check numeric range compatibility + lo, hi = float(opt.min), float(opt.max) + if lo < hi and not (lo <= num_val <= hi): + raise ValueError(f"{label}={num_val} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]") + + elif type_name == "STRING": + if isinstance(value, bool): + raise ValueError( + f"{label}={value!r} is not a valid string value for codec {vcodec!r}." + ) + if isinstance(value, str): + str_val = value + elif isinstance(value, (int, float)): + str_val = str(value) + else: + raise ValueError( + f"{label}={value!r} has unsupported type for STRING option on codec {vcodec!r}" + ) + + # Check string choice compatibility + choices = [c.name for c in (opt.choices or [])] + if choices and str_val not in choices: + raise ValueError( + f"{label}={str_val!r} is not a supported choice for codec " f"{vcodec!r}; valid choices: {choices}" ) else: return -def _validate_extra_option(vcodec: str, key: str, value: Any, opt: av.option.Option) -> None: - """Validate an ``extra_options`` entry: enforce numeric range/type only. - - Non-numeric options are passed through (FFmpeg accepts many ad-hoc strings). - """ - if opt.type.name not in FFMPEG_NUMERIC_OPTION_TYPES: - return - - label = f"extra_options[{key!r}]" - not_numeric = ValueError( - f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." - ) - if isinstance(value, bool): - raise not_numeric - if isinstance(value, (int, float)): - num = float(value) - elif isinstance(value, str): - try: - num = float(value) - except ValueError as e: - raise not_numeric from e - else: - raise not_numeric - - _check_numeric_range(label, num, opt, vcodec) - - -def _check_pixel_format(vcodec: str, pix_fmt: str, formats: tuple[str, ...]) -> None: +def _check_pixel_format(vcodec: str, pix_fmt: str) -> None: + formats = _get_codec_video_formats(vcodec) if formats and pix_fmt not in formats: raise ValueError( f"pix_fmt={pix_fmt!r} is not supported by codec {vcodec!r}; " @@ -181,57 +149,30 @@ def _check_pixel_format(vcodec: str, pix_fmt: str, formats: tuple[str, ...]) -> ) -def _check_tuning_fields( - config: VideoEncoderConfig, vcodec: str, options: dict[str, av.option.Option] +def _check_codec_options( + vcodec: str, codec_options: dict[str, Any], config: VideoEncoderConfig ) -> None: - supported_fields = [f for f in TUNING_FIELDS if _is_field_supported(f, vcodec, options)] - for field_name in TUNING_FIELDS: - value = getattr(config, field_name) - if not value: - continue - if field_name not in supported_fields: - raise ValueError( - f"{field_name}={value!r} is not supported by codec {vcodec!r}; " - f"supported fields for this codec: {supported_fields}" - ) - # ``g`` is stream-level (AVCodecContext.gop_size), not in ``options``. - # Enforce a positive integer value. - if field_name == "g": + """Validate merged encoder options (typed) against the codec's published AVOptions.""" + supported_options = _get_codec_options_by_name(vcodec) + for key, value in codec_options.items(): + # GOP size is not a codec-specific option, it has to be validated separately. + if key == "g": if isinstance(value, bool) or not isinstance(value, int) or value < 1: raise ValueError(f"g={value!r} must be a positive integer for codec {vcodec!r}") continue - # Value shape is only cross-checkable when the field maps directly - # to a private option: ``preset`` is literally ``"preset"``; - # ``crf`` maps per-codec. ``fast_decode`` (composite) falls through - # to FFmpeg at encode time. - if field_name == "preset": - opt = options.get("preset") - elif field_name == "crf": - opt = options.get(CRF_OPTION_BY_CODEC.get(vcodec, "")) - else: + if key not in supported_options: continue - if opt is not None: - _validate_option_value(vcodec, field_name, value, opt) - - -def _check_extra_options( - config: VideoEncoderConfig, vcodec: str, options: dict[str, av.option.Option] -) -> None: - # Torchcodec-style: only validate keys the codec exposes as AVOptions, - # and only enforce numeric range / numeric-type. Everything else is - # passed through (muxer options, ``x264-params``-style strings, etc.). - for key, value in config.extra_options.items(): - opt = options.get(key) - if opt is None: - continue - _validate_extra_option(vcodec, key, value, opt) + opt = supported_options[key] + label = f"extra_options[{key!r}]" if key in config.extra_options else key + _check_option_value(vcodec, label, value, opt) def check_video_encoder_config_pyav(config: VideoEncoderConfig) -> None: """Verify *config* is compatible with the bundled FFmpeg build. - Checks pixel format, tuning-field availability, value range/choices for - fields that map to a private option, and numeric ``extra_options``. + Checks pixel format, abstract tuning-field compatibility, and each merged + encoder option from :meth:`~lerobot.datasets.video_utils.VideoEncoderConfig.get_codec_options` + against PyAV (including numeric ``extra_options`` present in that dict). No-op when ``config.vcodec`` isn't in the local FFmpeg build. Raises: @@ -245,6 +186,5 @@ def check_video_encoder_config_pyav(config: VideoEncoderConfig) -> None: vcodec, ) return - _check_pixel_format(vcodec, config.pix_fmt, _get_codec_video_formats(vcodec)) - _check_tuning_fields(config, vcodec, options) - _check_extra_options(config, vcodec, options) + _check_pixel_format(config.vcodec, config.pix_fmt) + _check_codec_options(config.vcodec, config.get_codec_options(), config) diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 821c51f26..e3ab200e2 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -73,9 +73,11 @@ class VideoEncoderConfig: codec (``crf`` for software, ``qp`` for NVENC/VAAPI, ``q:v`` for VideoToolbox, ``global_quality`` for QSV). preset: Speed/quality preset. Accepted type is per-codec. - fast_decode: Fast-decode tuning. For libsvtav1 this is a level - (0-4); for h264/hevc any non-zero value enables ``tune=fastdecode``. - video_backend: Python library driving FFmpeg for encoding. Only ``"pyav"`` is currently supported. + fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2) + embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values + set ``tune=fastdecode``. Ignored for other codecs. + video_backend: Python library driving FFmpeg for encoding. Only ``"pyav"`` + is currently supported. extra_options: Free-form dictionary of additional FFmpeg options (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). """ @@ -137,42 +139,46 @@ class VideoEncoderConfig: return raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}") - def get_codec_options(self, encoder_threads: int | None = None) -> dict[str, str]: + def get_codec_options(self, encoder_threads: int | None = None, as_strings: bool = False) -> dict[str, str]: """Translate the tuning fields to codec-specific FFmpeg options. ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field. Args: - encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. For h264/hevc, this is mapped to ``threads``. Hardware encoders ignore this. + encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. + For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. + For h264/hevc, this is mapped to ``threads``. + Hardware encoders ignore this parameter. + as_strings: If ``True``, casts values to strings. """ - opts: dict[str, str] = {} + opts: dict[str, Any] = {} def set_if(key: str, value: Any) -> None: if value is not None: - opts[key] = str(value) + opts[key] = value if not as_strings else str(value) + + # GOP size is not a codec-specific option, so it is always set. + set_if("g", self.g) if self.vcodec == "libsvtav1": - set_if("g", self.g) set_if("crf", self.crf) set_if("preset", self.preset) svtav1_parts: list[str] = [] - if self.fast_decode: - svtav1_parts.append(f"fast-decode={self.fast_decode}") + if self.fast_decode is not None: + svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}") if encoder_threads is not None: svtav1_parts.append(f"lp={encoder_threads}") if svtav1_parts: opts["svtav1-params"] = ":".join(svtav1_parts) elif self.vcodec in ("h264", "hevc"): - set_if("g", self.g) set_if("crf", self.crf) set_if("preset", self.preset) if self.fast_decode: opts["tune"] = "fastdecode" set_if("threads", encoder_threads) elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): - set_if("g", self.g) if self.crf is not None: - opts["q:v"] = str(max(1, min(100, 100 - self.crf * 2))) + opts["q:v"] = max(1, min(100, 100 - self.crf * 2)) elif self.vcodec in ("h264_nvenc", "hevc_nvenc"): opts["rc"] = "constqp" set_if("qp", self.crf) @@ -183,12 +189,13 @@ class VideoEncoderConfig: set_if("global_quality", self.crf) set_if("preset", self.preset) else: - set_if("g", self.g) set_if("crf", self.crf) set_if("preset", self.preset) + # Extra options are merged last but never override structured fields (values are kept as given). for k, v in self.extra_options.items(): - opts.setdefault(k, str(v)) + if k not in opts: + set_if(k, v) return opts @@ -520,7 +527,7 @@ def encode_video_frames( with Image.open(input_list[0]) as dummy_image: width, height = dummy_image.size - video_options = camera_encoder_config.get_codec_options(encoder_threads) + video_options = camera_encoder_config.get_codec_options(encoder_threads, as_strings=True) # Set logging level if log_level is not None: @@ -841,7 +848,9 @@ class StreamingVideoEncoder: video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" vcodec = self._camera_encoder_config.vcodec - codec_options = self._camera_encoder_config.get_codec_options(self._encoder_threads) + codec_options = self._camera_encoder_config.get_codec_options( + self._encoder_threads, as_strings=True + ) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=self.fps, diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py index 3053bc75e..6dde3a36b 100644 --- a/tests/datasets/test_video_encoding.py +++ b/tests/datasets/test_video_encoding.py @@ -61,29 +61,29 @@ class TestCodecOptions: def test_libsvtav1_defaults(self): cfg = VideoEncoderConfig() opts = cfg.get_codec_options() - assert opts["g"] == "2" - assert opts["crf"] == "30" - assert opts["preset"] == "12" + assert opts["g"] == 2 + assert opts["crf"] == 30 + assert opts["preset"] == 12 @require_libsvtav1 def test_libsvtav1_custom_preset(self): cfg = VideoEncoderConfig(preset=8) - assert cfg.get_codec_options()["preset"] == "8" + assert cfg.get_codec_options()["preset"] == 8 @require_h264 def test_h264_options(self): cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None) opts = cfg.get_codec_options() - assert opts["g"] == "10" - assert opts["crf"] == "23" + assert opts["g"] == 10 + assert opts["crf"] == 23 assert "preset" not in opts @require_videotoolbox def test_videotoolbox_options(self): cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None) opts = cfg.get_codec_options() - assert opts["g"] == "2" - assert opts["q:v"] == "40" + assert opts["g"] == 2 + assert opts["q:v"] == 40 assert "crf" not in opts @_require_encoder("h264_nvenc") @@ -91,19 +91,19 @@ class TestCodecOptions: cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None) opts = cfg.get_codec_options() assert opts["rc"] == "constqp" - assert opts["qp"] == "25" + assert opts["qp"] == 25 assert "crf" not in opts assert "g" not in opts @_require_encoder("h264_vaapi") def test_vaapi_options(self): cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None) - assert cfg.get_codec_options()["qp"] == "28" + assert cfg.get_codec_options()["qp"] == 28 @_require_encoder("h264_qsv") def test_qsv_options(self): cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None) - assert cfg.get_codec_options()["global_quality"] == "25" + assert cfg.get_codec_options()["global_quality"] == 25 @require_h264 def test_no_g_no_crf(self): @@ -121,7 +121,7 @@ class TestCodecOptions: @require_h264 def test_encoder_threads_h264(self): cfg = VideoEncoderConfig(vcodec="h264", preset=None) - assert cfg.get_codec_options(encoder_threads=2)["threads"] == "2" + assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2 @require_libsvtav1 def test_fast_decode_libsvtav1(self): @@ -129,6 +129,14 @@ class TestCodecOptions: opts = cfg.get_codec_options() assert "fast-decode=1" in opts.get("svtav1-params", "") + @require_libsvtav1 + def test_libsvtav1_fast_decode_clamped_to_svt_range(self): + """Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode).""" + cfg = VideoEncoderConfig(fast_decode=100) + assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "") + cfg_neg = VideoEncoderConfig(fast_decode=-5) + assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "") + @require_h264 def test_fast_decode_h264(self): cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None) @@ -156,10 +164,10 @@ class TestCodecOptions: assert cfg.get_codec_options()["preset"] == "slow" @require_videotoolbox - def test_preset_on_videotoolbox_raises(self): + def test_preset_on_videotoolbox_not_set(self): """videotoolbox has no preset option at all.""" - with pytest.raises(ValueError, match="preset"): - VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow") + cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow") + assert "preset" not in cfg.get_codec_options() @require_libsvtav1 def test_libsvtav1_preset_out_of_range_raises(self): @@ -175,11 +183,36 @@ class TestCodecOptions: with pytest.raises(ValueError, match="crf.*out of range"): VideoEncoderConfig(vcodec="libsvtav1", crf=64) + @require_libsvtav1 + def test_libsvtav1_crf_rejects_python_float(self): + """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig(vcodec="libsvtav1", crf=2.5) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_fractional_string(self): + """INT options reject fractional values even when supplied only via ``extra_options``.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": "2.5"}, + ) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_float(self): + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": 2.5}, + ) + @require_h264 def test_h264_crf_accepts_float_and_int(self): """x264 exposes crf as a FLOAT option, so both int and float are accepted.""" - assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == "23" - assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == "23.5" + assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23 + assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5 @require_libsvtav1 def test_validate_is_rerunnable(self): @@ -189,12 +222,6 @@ class TestCodecOptions: with pytest.raises(ValueError, match="out of range"): cfg.validate() - @require_videotoolbox - def test_fast_decode_on_videotoolbox_raises(self): - """videotoolbox has no `tune` option; fast_decode must not be silently dropped.""" - with pytest.raises(ValueError, match="fast_decode"): - VideoEncoderConfig(vcodec="h264_videotoolbox", preset=None, fast_decode=1) - class TestExtraOptions: @require_libsvtav1 @@ -249,17 +276,18 @@ class TestExtraOptions: @require_libsvtav1 def test_merged_into_codec_options_and_stringified(self): - """extra_options are merged into get_codec_options() as strings.""" + """Typed merge by default; ``as_strings=True`` matches FFmpeg option dict.""" cfg = VideoEncoderConfig(extra_options={"qp": 20}) opts = cfg.get_codec_options() - assert opts["qp"] == "20" - assert isinstance(opts["qp"], str) + assert opts["qp"] == 20 + assert isinstance(opts["qp"], int) + assert cfg.get_codec_options(as_strings=True)["qp"] == "20" @require_libsvtav1 def test_structured_fields_win_on_collision(self): """A colliding extra_options key is discarded; the structured field wins.""" cfg = VideoEncoderConfig(crf=30, extra_options={"crf": 18}) - assert cfg.get_codec_options()["crf"] == "30" + assert cfg.get_codec_options()["crf"] == 30 class TestEncoderDetection: