lerobot/tests/datasets/test_video_encoding.py

#!/usr/bin/env python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for ``lerobot.datasets.video_utils`` encoding functions and ``VideoEncoderConfig`` config class."""

import json
from pathlib import Path

import numpy as np
import pytest

pytest.importorskip("av", reason="av is required (install lerobot[dataset])")

import av  # noqa: E402

from lerobot.datasets.image_writer import write_image
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.pyav_utils import get_codec
from lerobot.datasets.utils import INFO_PATH
from lerobot.datasets.video_utils import (
    VALID_VIDEO_CODECS,
    VideoEncoderConfig,
    concatenate_video_files,
    encode_video_frames,
    get_video_info,
)


# Per-codec skip markers — validation tests only fire when the codec is available
def _require_encoder(vcodec: str) -> pytest.MarkDecorator:
    """Build a ``skipif`` marker that triggers when ``get_codec(vcodec)`` returns ``None``.

    Args:
        vcodec: Encoder name passed to ``get_codec``.

    Returns:
        A ``pytest.mark.skipif`` decorator whose reason string names the missing encoder.
    """
    encoder_missing = get_codec(vcodec) is None
    skip_reason = f"{vcodec!r} not in local FFmpeg build"
    return pytest.mark.skipif(encoder_missing, reason=skip_reason)


# Pre-built skip markers, one per encoder name. Each ``_require_encoder`` call runs
# once when this module is imported, so encoder availability is probed at import time.
require_libsvtav1 = _require_encoder("libsvtav1")
require_h264 = _require_encoder("h264")
require_videotoolbox = _require_encoder("h264_videotoolbox")
require_nvenc = _require_encoder("h264_nvenc")
require_vaapi = _require_encoder("h264_vaapi")
require_qsv = _require_encoder("h264_qsv")


# ─── VideoEncoderConfig / codec options ──────────────────────────────


class TestCodecOptions:
    """Tests for ``VideoEncoderConfig`` validation and ``get_codec_options()`` output.

    Every test is gated by one of the module-level ``require_*`` markers, so it is
    skipped when the encoder it targets is missing from the local FFmpeg build.
    """

    @require_libsvtav1
    def test_libsvtav1_defaults(self):
        """A config built with no arguments yields g=2, crf=30 and preset=12."""
        cfg = VideoEncoderConfig()
        opts = cfg.get_codec_options()
        assert opts["g"] == 2
        assert opts["crf"] == 30
        assert opts["preset"] == 12

    @require_libsvtav1
    def test_libsvtav1_custom_preset(self):
        """An explicit preset is reported unchanged in the codec options."""
        cfg = VideoEncoderConfig(preset=8)
        assert cfg.get_codec_options()["preset"] == 8

    @require_h264
    def test_h264_options(self):
        """Explicit g/crf are forwarded; ``preset=None`` emits no ``preset`` key."""
        cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None)
        opts = cfg.get_codec_options()
        assert opts["g"] == 10
        assert opts["crf"] == 23
        assert "preset" not in opts

    @require_videotoolbox
    def test_videotoolbox_options(self):
        """With crf=30 the options carry ``q:v`` == 40 and no ``crf`` key."""
        cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None)
        opts = cfg.get_codec_options()
        assert opts["g"] == 2
        assert opts["q:v"] == 40
        assert "crf" not in opts

    # Uses the shared module-level marker, like every other test in this class.
    @require_nvenc
    def test_nvenc_options(self):
        """crf is emitted as ``qp`` with ``rc=constqp``; ``crf`` and ``g`` are not emitted."""
        cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None)
        opts = cfg.get_codec_options()
        assert opts["rc"] == "constqp"
        assert opts["qp"] == 25
        assert "crf" not in opts
        assert "g" not in opts

    @require_vaapi
    def test_vaapi_options(self):
        """crf is emitted as ``qp``."""
        cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None)
        assert cfg.get_codec_options()["qp"] == 28

    @require_qsv
    def test_qsv_options(self):
        """crf is emitted as ``global_quality``."""
        cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None)
        assert cfg.get_codec_options()["global_quality"] == 25

    @require_h264
    def test_no_g_no_crf(self):
        """Setting g and crf to ``None`` leaves both keys out of the options."""
        cfg = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None)
        opts = cfg.get_codec_options()
        assert "g" not in opts
        assert "crf" not in opts

    @require_libsvtav1
    def test_encoder_threads_libsvtav1(self):
        """``encoder_threads=4`` shows up as ``lp=4`` inside ``svtav1-params``."""
        cfg = VideoEncoderConfig(fast_decode=0)
        opts = cfg.get_codec_options(encoder_threads=4)
        assert "lp=4" in opts.get("svtav1-params", "")

    @require_h264
    def test_encoder_threads_h264(self):
        """``encoder_threads=2`` shows up as the ``threads`` option."""
        cfg = VideoEncoderConfig(vcodec="h264", preset=None)
        assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2

    @require_libsvtav1
    def test_fast_decode_libsvtav1(self):
        """``fast_decode=1`` shows up as ``fast-decode=1`` inside ``svtav1-params``."""
        cfg = VideoEncoderConfig(fast_decode=1)
        opts = cfg.get_codec_options()
        assert "fast-decode=1" in opts.get("svtav1-params", "")

    @require_libsvtav1
    def test_libsvtav1_fast_decode_clamped_to_svt_range(self):
        """Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode)."""
        cfg = VideoEncoderConfig(fast_decode=100)
        assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "")
        cfg_neg = VideoEncoderConfig(fast_decode=-5)
        assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "")

    @require_h264
    def test_fast_decode_h264(self):
        """``fast_decode=1`` shows up as ``tune=fastdecode``."""
        cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None)
        assert cfg.get_codec_options()["tune"] == "fastdecode"

    @require_libsvtav1
    def test_pix_fmt_unsupported_raises(self):
        """Passing an unsupported pix_fmt is a hard error."""
        with pytest.raises(ValueError, match="pix_fmt"):
            VideoEncoderConfig(pix_fmt="yuv444p")  # libsvtav1 only supports yuv420p variants

    @require_libsvtav1
    @require_h264
    def test_preset_default_behaviour(self):
        """Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None."""
        assert VideoEncoderConfig().preset == 12
        assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12
        assert VideoEncoderConfig(vcodec="h264").preset is None
        assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None

    @require_h264
    def test_preset_string_on_h264(self):
        """h264 accepts string presets and forwards them to FFmpeg."""
        cfg = VideoEncoderConfig(vcodec="h264", preset="slow")
        assert cfg.get_codec_options()["preset"] == "slow"

    @require_videotoolbox
    def test_preset_on_videotoolbox_not_set(self):
        """videotoolbox has no preset option at all."""
        cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow")
        assert "preset" not in cfg.get_codec_options()

    @require_libsvtav1
    def test_libsvtav1_preset_out_of_range_raises(self):
        """libsvtav1 preset must sit in [-2, 13] as exposed by PyAV."""
        with pytest.raises(ValueError, match="out of range"):
            VideoEncoderConfig(vcodec="libsvtav1", preset=100)
        with pytest.raises(ValueError, match="out of range"):
            VideoEncoderConfig(vcodec="libsvtav1", preset=-3)

    @require_libsvtav1
    def test_libsvtav1_crf_out_of_range_raises(self):
        """libsvtav1 crf must sit in [0, 63]."""
        with pytest.raises(ValueError, match="crf.*out of range"):
            VideoEncoderConfig(vcodec="libsvtav1", crf=64)

    @require_libsvtav1
    def test_libsvtav1_crf_rejects_python_float(self):
        """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation."""
        with pytest.raises(ValueError, match="float values are not allowed"):
            VideoEncoderConfig(vcodec="libsvtav1", crf=2.5)

    @require_libsvtav1
    def test_libsvtav1_extra_crf_rejects_fractional_string(self):
        """INT options reject fractional values even when supplied only via ``extra_options``."""
        with pytest.raises(ValueError, match="float values are not allowed"):
            VideoEncoderConfig(
                vcodec="libsvtav1",
                crf=None,
                extra_options={"crf": "2.5"},
            )

    @require_libsvtav1
    def test_libsvtav1_extra_crf_rejects_float(self):
        """A Python float ``crf`` supplied via ``extra_options`` is rejected as well."""
        with pytest.raises(ValueError, match="float values are not allowed"):
            VideoEncoderConfig(
                vcodec="libsvtav1",
                crf=None,
                extra_options={"crf": 2.5},
            )

    @require_h264
    def test_h264_crf_accepts_float_and_int(self):
        """x264 exposes crf as a FLOAT option, so both int and float are accepted."""
        assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23
        assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5

    @require_libsvtav1
    def test_validate_is_rerunnable(self):
        """After mutating a field, validate() re-checks and surfaces new issues."""
        cfg = VideoEncoderConfig(vcodec="libsvtav1")
        cfg.preset = 100  # now out of range
        with pytest.raises(ValueError, match="out of range"):
            cfg.validate()


class TestExtraOptions:
    """Checks on how ``VideoEncoderConfig`` validates, stores and merges ``extra_options``."""

    @require_libsvtav1
    def test_default_is_empty_dict(self):
        """A config built with no arguments has an empty ``extra_options`` dict."""
        config = VideoEncoderConfig()
        assert config.extra_options == {}

    @require_libsvtav1
    def test_unknown_key_passes_through(self):
        """A made-up key is accepted and kept unchanged in ``extra_options``."""
        expected = {"totally_made_up_option": "value"}
        config = VideoEncoderConfig(extra_options=dict(expected))
        assert config.extra_options == expected

    @require_libsvtav1
    def test_numeric_value_in_range_ok(self):
        """libsvtav1 exposes ``qp`` as INT in [0, 63]."""
        expected = {"qp": 30}
        config = VideoEncoderConfig(extra_options=dict(expected))
        assert config.extra_options == expected

    @require_libsvtav1
    def test_numeric_out_of_range_raises(self):
        """An integer ``qp`` of 999 is rejected with an out-of-range error."""
        pattern = r"extra_options\['qp'\].*out of range"
        with pytest.raises(ValueError, match=pattern):
            VideoEncoderConfig(extra_options={"qp": 999})

    @require_libsvtav1
    def test_numeric_string_accepted_in_range(self):
        """Numeric strings are accepted for numeric options (mirrors FFmpeg)."""
        expected = {"qp": "18"}
        config = VideoEncoderConfig(extra_options=dict(expected))
        assert config.extra_options == expected

    @require_libsvtav1
    def test_numeric_string_out_of_range_raises(self):
        """A numeric string ``qp`` of "999" is rejected with an out-of-range error."""
        pattern = r"extra_options\['qp'\].*out of range"
        with pytest.raises(ValueError, match=pattern):
            VideoEncoderConfig(extra_options={"qp": "999"})

    @require_libsvtav1
    def test_non_numeric_string_on_numeric_option_raises(self):
        """A non-numeric string for ``qp`` is rejected with a not-numeric error."""
        pattern = r"extra_options\['qp'\].*not numeric"
        with pytest.raises(ValueError, match=pattern):
            VideoEncoderConfig(extra_options={"qp": "medium"})

    @require_libsvtav1
    def test_bool_on_numeric_option_raises(self):
        """``bool`` is explicitly rejected for numeric options."""
        pattern = r"extra_options\['qp'\].*not numeric"
        with pytest.raises(ValueError, match=pattern):
            VideoEncoderConfig(extra_options={"qp": True})

    @require_h264
    def test_string_option_passes_through_unchecked(self):
        """String-typed AVOptions are NOT enum-checked (too many accept freeform)."""
        expected = {"tune": "some-future-tune"}
        config = VideoEncoderConfig(vcodec="h264", preset=None, extra_options=dict(expected))
        assert config.extra_options == expected

    @require_libsvtav1
    def test_merged_into_codec_options_and_stringified(self):
        """Typed merge by default; ``as_strings=True`` matches FFmpeg option dict."""
        config = VideoEncoderConfig(extra_options={"qp": 20})

        typed_qp = config.get_codec_options()["qp"]
        assert typed_qp == 20
        assert isinstance(typed_qp, int)

        stringified = config.get_codec_options(as_strings=True)
        assert stringified["qp"] == "20"

    @require_libsvtav1
    def test_structured_fields_win_on_collision(self):
        """A colliding extra_options key is discarded; the structured field wins."""
        config = VideoEncoderConfig(crf=30, extra_options={"crf": 18})
        merged = config.get_codec_options()
        assert merged["crf"] == 30


class TestEncoderDetection:
    """Checks on how ``VideoEncoderConfig`` resolves and validates the ``vcodec`` field."""

    @require_h264
    def test_explicit_codec_kept_when_available(self):
        """An explicitly requested ``h264`` is left as-is on the config."""
        assert VideoEncoderConfig(vcodec="h264").vcodec == "h264"

    @require_videotoolbox
    def test_auto_picks_videotoolbox_when_available(self):
        """``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present."""
        resolved = VideoEncoderConfig(vcodec="auto").vcodec
        assert resolved == "h264_videotoolbox"

    def test_invalid_codec_raises(self):
        """An unrecognised codec name raises ``ValueError`` mentioning "Invalid vcodec"."""
        bogus_name = "not_a_real_codec"
        with pytest.raises(ValueError, match="Invalid vcodec"):
            VideoEncoderConfig(vcodec=bogus_name)

    def test_hw_encoder_names_listed_as_valid(self):
        """``auto`` and the two hardware encoder names checked here are in ``VALID_VIDEO_CODECS``."""
        for codec_name in ("auto", "h264_videotoolbox", "h264_nvenc"):
            assert codec_name in VALID_VIDEO_CODECS


# Pre-encoded clips live two directory levels above this file, under artifacts/encoded_videos.
TEST_ARTIFACTS_DIR = Path(__file__).parent.parent.joinpath("artifacts", "encoded_videos")

# Name of the single video feature; declared first so the feature table reuses it.
VIDEO_KEY = "observation.images.cam"

# Default video feature set used by persistence tests.
VIDEO_FEATURES = {
    VIDEO_KEY: {
        "dtype": "video",
        "shape": (64, 96, 3),
        "names": ["height", "width", "channels"],
    },
    "action": {
        "dtype": "float32",
        "shape": (2,),
        "names": ["a", "b"],
    },
}


def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
    """Fill ``imgs_dir`` with ``num_frames`` random uint8 images of shape ``(height, width, 3)``.

    The directory (with any missing parents) is created if needed. Files are written
    through ``write_image`` and named ``frame-<index zero-padded to 6 digits>.png``.
    """
    imgs_dir.mkdir(parents=True, exist_ok=True)
    frame_shape = (height, width, 3)
    for frame_idx in range(num_frames):
        pixels = np.random.randint(0, 256, frame_shape, dtype=np.uint8)
        target = imgs_dir / f"frame-{frame_idx:06d}.png"
        write_image(pixels, target)


def _encode_video(
    path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None
) -> Path:
    """Write random frames next to ``path`` and encode them into ``path``.

    Frames go into a sibling directory named ``imgs_<stem of path>``. The encode call
    always passes ``overwrite=True`` and forwards ``cfg`` as ``camera_encoder_config``.

    Returns:
        ``path``, unchanged, for convenient chaining in tests.
    """
    frames_dir = path.parent.joinpath(f"imgs_{path.stem}")
    _write_frames(frames_dir, num_frames=num_frames)
    encode_video_frames(
        frames_dir,
        path,
        fps=fps,
        camera_encoder_config=cfg,
        overwrite=True,
    )
    return path


def _read_feature_info(dataset: LeRobotDataset) -> dict:
    """Load the dataset's info JSON from disk and return the ``info`` entry of the video feature.

    The file is read from ``dataset.root / INFO_PATH`` on every call, so the result
    reflects what is currently persisted on disk.
    """
    info_file = dataset.root / INFO_PATH
    with info_file.open() as handle:
        metadata = json.load(handle)
    video_feature = metadata["features"][VIDEO_KEY]
    return video_feature["info"]


def _add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
    """Append ``num_frames`` frames to ``dataset`` via ``dataset.add_frame``.

    Each frame carries a random uint8 image shaped like the dataset's video feature,
    a zero ``action`` vector of length 2 (float32) and the task string ``"test"``.
    """
    image_shape = dataset.meta.features[VIDEO_KEY]["shape"]
    for _ in range(num_frames):
        frame = {
            VIDEO_KEY: np.random.randint(0, 256, image_shape, dtype=np.uint8),
            "action": np.zeros(2, dtype=np.float32),
            "task": "test",
        }
        dataset.add_frame(frame)


class TestGetVideoInfo:
    """Checks on the dict returned by ``get_video_info`` for the stored ``clip_4frames.mp4`` artifact."""

    def test_returns_all_stream_fields(self):
        """Without an encoder config, stream fields are reported and encoder keys are absent."""
        clip_path = TEST_ARTIFACTS_DIR / "clip_4frames.mp4"
        stream_info = get_video_info(clip_path)

        assert stream_info["video.height"] == 64
        assert stream_info["video.width"] == 96
        assert stream_info["video.pix_fmt"] == "yuv420p"
        assert stream_info["video.fps"] == 30
        assert stream_info["video.channels"] == 3
        assert stream_info["video.is_depth_map"] is False
        assert stream_info["has_audio"] is False
        for encoder_key in ("video.g", "video.crf", "video.preset"):
            assert encoder_key not in stream_info

    @require_libsvtav1
    def test_merges_encoder_config_as_video_prefixed_entries(self):
        """Passing an encoder config adds its fields under ``video.``-prefixed keys."""
        encoder_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
        clip_path = TEST_ARTIFACTS_DIR / "clip_4frames.mp4"

        merged_info = get_video_info(clip_path, camera_encoder_config=encoder_cfg)

        assert merged_info["video.g"] == 2
        assert merged_info["video.crf"] == 30
        assert merged_info["video.preset"] == 12
        assert merged_info["video.fast_decode"] == 0
        assert merged_info["video.video_backend"] == "pyav"
        assert merged_info["video.extra_options"] == {}

    @require_libsvtav1
    def test_stream_derived_keys_take_precedence_over_config(self):
        """``video.codec`` is populated and ``video.pix_fmt`` is ``yuv420p`` when a config is passed."""
        encoder_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p")
        clip_path = TEST_ARTIFACTS_DIR / "clip_4frames.mp4"

        merged_info = get_video_info(clip_path, camera_encoder_config=encoder_cfg)

        # populated from stream, not from config's vcodec
        assert merged_info["video.codec"]
        assert merged_info["video.pix_fmt"] == "yuv420p"


class TestEncodeVideoFrames:
    """Checks on ``encode_video_frames``: output readability, frame count, overwrite handling."""

    @require_libsvtav1
    def test_produces_readable_mp4(self, tmp_path):
        """The encoded file exists and reports the 64x96 geometry of the input frames."""
        encoded = _encode_video(tmp_path / "out.mp4")

        assert encoded.exists()
        stream_info = get_video_info(encoded)
        assert stream_info["video.height"] == 64
        assert stream_info["video.width"] == 96

    @require_libsvtav1
    def test_frame_count_and_duration_match_input(self, tmp_path):
        """Decoded frame count equals the input count and duration is within 0.1 s of frames/fps."""
        expected_frames = 10
        fps = 30
        encoded = _encode_video(tmp_path / "out.mp4", num_frames=expected_frames, fps=fps)

        with av.open(str(encoded)) as container:
            stream = container.streams.video[0]
            decoded_frames = 0
            for _ in container.decode(stream):
                decoded_frames += 1
            # Use the stream duration when it is set, otherwise the container's.
            if stream.duration is None:
                seconds = float(container.duration / av.time_base)
            else:
                seconds = float(stream.duration * stream.time_base)

        assert decoded_frames == expected_frames
        assert abs(seconds - expected_frames / fps) < 0.1

    def test_overwrite_false_skips_existing_file(self, tmp_path):
        """With ``overwrite=False`` an existing output file keeps its bytes."""
        frames_dir = tmp_path / "imgs"
        _write_frames(frames_dir)
        target = tmp_path / "out.mp4"
        original_bytes = b"pre-existing content"
        target.write_bytes(original_bytes)

        encode_video_frames(frames_dir, target, fps=30, overwrite=False)

        assert target.read_bytes() == original_bytes

    @require_libsvtav1
    def test_overwrite_true_replaces_existing_file(self, tmp_path):
        """With ``overwrite=True`` the output path holds a video reporting height 64 afterwards."""
        frames_dir = tmp_path / "imgs"
        _write_frames(frames_dir)
        target = tmp_path / "out.mp4"
        target.write_bytes(b"stale content")

        encode_video_frames(frames_dir, target, fps=30, overwrite=True)

        stream_info = get_video_info(target)
        assert stream_info["video.height"] == 64

    @require_libsvtav1
    def test_custom_encoder_config_fields_stored_in_info(self, tmp_path):
        """All stream-derived and encoder config fields are present after encoding."""
        encoder_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10)
        encoded = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=encoder_cfg)

        full_info = get_video_info(encoded, camera_encoder_config=encoder_cfg)

        # Stream-derived
        assert full_info["video.height"] == 64
        assert full_info["video.width"] == 96
        assert full_info["video.channels"] == 3
        assert full_info["video.codec"] == "av1"
        assert full_info["video.pix_fmt"] == "yuv420p"
        assert full_info["video.fps"] == 30
        assert full_info["video.is_depth_map"] is False
        assert full_info["has_audio"] is False
        # Encoder config
        assert full_info["video.g"] == 4
        assert full_info["video.crf"] == 25
        assert full_info["video.preset"] == 10
        assert full_info["video.fast_decode"] == 0
        assert full_info["video.video_backend"] == "pyav"
        assert full_info["video.extra_options"] == {}


class TestConcatenateVideoFiles:
    """Checks on ``concatenate_video_files`` using the stored clips under ``TEST_ARTIFACTS_DIR``."""

    def test_two_clips_frame_count(self, tmp_path):
        """Output frame count equals the sum of the two input frame counts."""
        merged_path = tmp_path / "out.mp4"
        inputs = [
            TEST_ARTIFACTS_DIR / "clip_6frames.mp4",
            TEST_ARTIFACTS_DIR / "clip_4frames.mp4",
        ]
        concatenate_video_files(inputs, merged_path)

        with av.open(str(merged_path)) as container:
            decoded_frames = sum(1 for _ in container.decode(video=0))
        assert decoded_frames == 10

    def test_three_clips_frame_count(self, tmp_path):
        """Concatenating the 5-frame clip three times decodes to 15 frames."""
        merged_path = tmp_path / "out.mp4"
        five_frame_clip = TEST_ARTIFACTS_DIR / "clip_5frames.mp4"
        concatenate_video_files([five_frame_clip, five_frame_clip, five_frame_clip], merged_path)

        with av.open(str(merged_path)) as container:
            decoded_frames = sum(1 for _ in container.decode(video=0))
        assert decoded_frames == 15

    @require_libsvtav1
    def test_geometry_preserved(self, tmp_path):
        """Output resolution, fps, codec and pixel format must match the inputs."""
        merged_path = tmp_path / "out.mp4"
        inputs = [
            TEST_ARTIFACTS_DIR / "clip_4frames.mp4",
            TEST_ARTIFACTS_DIR / "clip_4frames.mp4",
        ]
        concatenate_video_files(inputs, merged_path)

        merged_info = get_video_info(merged_path)
        assert merged_info["video.height"] == 64
        assert merged_info["video.width"] == 96
        assert merged_info["video.fps"] == 30
        assert merged_info["video.codec"] == "av1"
        assert merged_info["video.pix_fmt"] == "yuv420p"

    def test_compatibility_check_raises_on_different_codec(self, tmp_path):
        """With ``compatibility_check=True``, pairing the 4-frame clip with ``clip_h264.mp4`` raises ``ValueError``."""
        mismatched = [
            TEST_ARTIFACTS_DIR / "clip_4frames.mp4",
            TEST_ARTIFACTS_DIR / "clip_h264.mp4",
        ]
        with pytest.raises(ValueError):
            concatenate_video_files(mismatched, tmp_path / "out.mp4", compatibility_check=True)

    def test_compatibility_check_raises_on_different_resolution(self, tmp_path):
        """With ``compatibility_check=True``, pairing the 4-frame clip with ``clip_32x48.mp4`` raises ``ValueError``."""
        mismatched = [
            TEST_ARTIFACTS_DIR / "clip_4frames.mp4",
            TEST_ARTIFACTS_DIR / "clip_32x48.mp4",
        ]
        with pytest.raises(ValueError):
            concatenate_video_files(mismatched, tmp_path / "out.mp4", compatibility_check=True)


class TestEncoderConfigPersistence:
    """Encoder config must be stored as ``video.<field>`` entries in
    ``info["features"][key]["info"]`` when the first episode is saved.
    """

    @require_libsvtav1
    def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory):
        """After one saved episode, the on-disk feature info holds stream and encoder fields."""
        encoder_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
        dataset = empty_lerobot_dataset_factory(
            root=tmp_path / "ds",
            features=VIDEO_FEATURES,
            use_videos=True,
            camera_encoder_config=encoder_cfg,
        )

        _add_frames(dataset, num_frames=4)
        dataset.save_episode()
        dataset.finalize()

        persisted = _read_feature_info(dataset)

        # Stream-derived fields.
        assert persisted["video.height"] == 64
        assert persisted["video.width"] == 96
        assert persisted["video.fps"] == 30
        # Fields coming from the encoder config.
        assert persisted["video.g"] == 2
        assert persisted["video.crf"] == 30
        assert persisted["video.preset"] == 12
        assert persisted["video.fast_decode"] == 0
        assert persisted["video.video_backend"] == "pyav"
        assert persisted["video.extra_options"] == {}

    @require_libsvtav1
    def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory):
        """The feature info read after episode two equals the snapshot taken after episode one."""
        encoder_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
        dataset = empty_lerobot_dataset_factory(
            root=tmp_path / "ds",
            features=VIDEO_FEATURES,
            use_videos=True,
            camera_encoder_config=encoder_cfg,
        )

        # First episode: snapshot what was written to disk.
        _add_frames(dataset, num_frames=4)
        dataset.save_episode()
        snapshot_after_first = dict(_read_feature_info(dataset))

        # Second episode, then finalize.
        _add_frames(dataset, num_frames=4)
        dataset.save_episode()
        dataset.finalize()

        snapshot_after_second = _read_feature_info(dataset)
        assert snapshot_after_second == snapshot_after_first