From 6b2d5cec4cb80cc2e75afb95b8dcf1f149dc7568 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 12 Jun 2026 19:17:59 +0200
Subject: [PATCH] test(depth encoding): updating and cleaning video/depth
 encoding tests

---
 tests/datasets/test_video_encoding.py | 352 +++++++++++++++++++++-----
 tests/fixtures/constants.py           |  32 +++
 tests/fixtures/dataset_factories.py   |  13 +-
 3 files changed, 325 insertions(+), 72 deletions(-)

diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py
index ca735b77c..0afccd13f 100644
--- a/tests/datasets/test_video_encoding.py
+++ b/tests/datasets/test_video_encoding.py
@@ -37,7 +37,15 @@ from lerobot.datasets.video_utils import (
     get_video_info,
     reencode_video,
 )
-from tests.fixtures.constants import DUMMY_VIDEO_INFO
+from tests.fixtures.constants import (
+    DUMMY_DEPTH_FEATURES,
+    DUMMY_DEPTH_KEY,
+    DUMMY_DEPTH_VIDEO_INFO_FULL,
+    DUMMY_VIDEO_FEATURES,
+    DUMMY_VIDEO_INFO,
+    DUMMY_VIDEO_KEY,
+)
+from tests.fixtures.dataset_factories import add_frames
 
 
 # Per-codec skip markers — validation tests only fire when the codec is available
@@ -48,12 +56,67 @@ def _require_encoder(vcodec: str) -> pytest.MarkDecorator:
 
 require_libsvtav1 = _require_encoder("libsvtav1")
 require_h264 = _require_encoder("h264")
+require_hevc = _require_encoder("hevc")
 require_videotoolbox = _require_encoder("h264_videotoolbox")
 require_nvenc = _require_encoder("h264_nvenc")
 require_vaapi = _require_encoder("h264_vaapi")
 require_qsv = _require_encoder("h264_qsv")
 
 
+TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos"
+
+
+def _write_RGB_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
+    imgs_dir.mkdir(parents=True, exist_ok=True)
+    for i in range(num_frames):
+        arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+        write_image(arr, imgs_dir / f"frame-{i:06d}.png")
+
+
+def _write_depth_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
+    """Write synthetic uint16 depth TIFFs (millimetres) for depth encoder tests.
+
+    Uses a smooth linear ramp + per-frame offset (not white noise) so HEVC Main 12
+    on ``gray12le`` compresses well. Values span ~100 mm to 10 m, covering most
+    of the default ``[DEPTH_MIN, DEPTH_MAX]`` metres range after
+    ``quantize_depth(input_unit="auto")`` (where ``"auto"`` resolves to ``"mm"``).
+    """
+    imgs_dir.mkdir(parents=True, exist_ok=True)
+    base = np.linspace(100.0, 10_000.0, height * width, dtype=np.float32).reshape(height, width)
+    for i in range(num_frames):
+        arr = (base + 50.0 * i).clip(0, 65535).astype(np.uint16)
+        write_image(arr, imgs_dir / f"frame-{i:06d}.tiff")
+
+
+def _encode_video(
+    path: Path,
+    num_frames: int = 4,
+    fps: int = 30,
+    cfg: VideoEncoderConfig | None = None,
+    depth: bool = False,
+) -> Path:
+    """Write synthetic frames to a temp dir and encode them to ``path``.
+
+    ``depth=False`` writes uint8 RGB PNG noise and encodes with ``cfg``
+    (``None`` falls back to the library default). ``depth=True`` writes synthetic uint16
+    depth TIFFs and encodes with ``cfg`` or a default :class:`DepthEncoderConfig`
+    (HEVC Main 12 / ``gray12le``).
+    """
+    imgs_dir = path.parent / f"imgs_{path.stem}"
+    if depth:
+        _write_depth_frames(imgs_dir, num_frames=num_frames)
+        cfg = cfg or DepthEncoderConfig()
+    else:
+        _write_RGB_frames(imgs_dir, num_frames=num_frames)
+    encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True)
+    return path
+
+
+def _read_feature_info(dataset: LeRobotDataset, key: str = DUMMY_VIDEO_KEY) -> dict:
+    info = json.loads((dataset.root / INFO_PATH).read_text())
+    return info["features"][key]["info"]
+
+
 # ─── VideoEncoderConfig / codec options ──────────────────────────────
 
 
@@ -87,7 +150,7 @@ class TestCodecOptions:
         assert opts["q:v"] == 40
         assert "crf" not in opts
 
-    @_require_encoder("h264_nvenc")
+    @require_nvenc
     def test_nvenc_options(self):
         cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None)
         opts = cfg.get_codec_options()
@@ -96,12 +159,12 @@ class TestCodecOptions:
         assert "crf" not in opts
         assert opts["g"] == 2
 
-    @_require_encoder("h264_vaapi")
+    @require_vaapi
     def test_vaapi_options(self):
         cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None)
         assert cfg.get_codec_options()["qp"] == 28
 
-    @_require_encoder("h264_qsv")
+    @require_qsv
     def test_qsv_options(self):
         cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None)
         assert cfg.get_codec_options()["global_quality"] == 25
@@ -313,59 +376,6 @@ class TestEncoderDetection:
         assert "h264_nvenc" in VALID_VIDEO_CODECS
 
 
-TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos"
-
-# Default video feature set used by persistence tests.
-VIDEO_FEATURES = {
-    "observation.images.cam": {
-        "dtype": "video",
-        "shape": (64, 96, 3),
-        "names": ["height", "width", "channels"],
-    },
-    "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]},
-}
-VIDEO_KEY = "observation.images.cam"
-
-
-def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
-    imgs_dir.mkdir(parents=True, exist_ok=True)
-    for i in range(num_frames):
-        arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-        write_image(arr, imgs_dir / f"frame-{i:06d}.png")
-
-
-def _encode_video(
-    path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None
-) -> Path:
-    imgs_dir = path.parent / f"imgs_{path.stem}"
-    _write_frames(imgs_dir, num_frames=num_frames)
-    encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True)
-    return path
-
-
-def _read_feature_info(dataset: LeRobotDataset) -> dict:
-    info = json.loads((dataset.root / INFO_PATH).read_text())
-    return info["features"][VIDEO_KEY]["info"]
-
-
-def _add_frames(dataset: LeRobotDataset, num_frames: int, video_keys: list[str] | None = None) -> None:
-    from lerobot.utils.constants import DEFAULT_FEATURES
-
-    if video_keys is None:
-        video_keys = dataset.meta.video_keys
-    for _ in range(num_frames):
-        frame: dict = {"task": "test"}
-        for key, ft in dataset.meta.features.items():
-            if key in DEFAULT_FEATURES:
-                continue
-            shape = ft["shape"]
-            if key in video_keys:
-                frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8)
-            else:
-                frame[key] = np.zeros(shape, dtype=np.float32)
-        dataset.add_frame(frame)
-
-
 class TestGetVideoInfo:
     def test_returns_all_stream_fields(self):
         info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4")
@@ -439,7 +449,7 @@ class TestEncodeVideoFrames:
 
     def test_overwrite_false_skips_existing_file(self, tmp_path):
         imgs_dir = tmp_path / "imgs"
-        _write_frames(imgs_dir)
+        _write_RGB_frames(imgs_dir)
         video_path = tmp_path / "out.mp4"
         sentinel = b"pre-existing content"
         video_path.write_bytes(sentinel)
@@ -451,7 +461,7 @@ class TestEncodeVideoFrames:
     @require_libsvtav1
     def test_overwrite_true_replaces_existing_file(self, tmp_path):
         imgs_dir = tmp_path / "imgs"
-        _write_frames(imgs_dir)
+        _write_RGB_frames(imgs_dir)
         video_path = tmp_path / "out.mp4"
         video_path.write_bytes(b"stale content")
 
@@ -572,10 +582,10 @@ class TestEncoderConfigPersistence:
     def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory):
         cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
         dataset = empty_lerobot_dataset_factory(
-            root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
+            root=tmp_path / "ds", features=DUMMY_VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
         )
 
-        _add_frames(dataset, num_frames=4)
+        add_frames(dataset, num_frames=4)
         dataset.save_episode()
         dataset.finalize()
 
@@ -595,14 +605,14 @@ class TestEncoderConfigPersistence:
     def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory):
         cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
         dataset = empty_lerobot_dataset_factory(
-            root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
+            root=tmp_path / "ds", features=DUMMY_VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
         )
 
-        _add_frames(dataset, num_frames=4)
+        add_frames(dataset, num_frames=4)
         dataset.save_episode()
         first_info = dict(_read_feature_info(dataset))
 
-        _add_frames(dataset, num_frames=4)
+        add_frames(dataset, num_frames=4)
         dataset.save_episode()
         dataset.finalize()
 
@@ -629,3 +639,217 @@ class TestFromVideoInfo:
         # ``{}`` placeholder (typical after a merge with disagreeing sources)
         # must not leak into the reconstructed config.
         assert cfg.extra_options == VideoEncoderConfig().extra_options
+
+
+# ─── Depth-specific encoding tests ────────────────────────────────────
+
+
+class TestEncodeDepthVideoFrames:
+    """Depth mirror of :class:`TestEncodeVideoFrames`.
+
+    Exercises ``encode_video_frames`` end-to-end through
+    :class:`DepthEncoderConfig` (HEVC Main 12 / ``gray12le``) on synthetic
+    uint16 depth TIFFs.
+    """
+
+    @require_hevc
+    def test_produces_readable_file(self, tmp_path):
+        video_path = _encode_video(tmp_path / "out.mp4", depth=True)
+
+        assert video_path.exists()
+        info = get_video_info(video_path, video_encoder=DepthEncoderConfig())
+        assert info["video.height"] == 64
+        assert info["video.width"] == 96
+        assert info["video.codec"] == "hevc"
+        assert info["video.pix_fmt"] == "gray12le"
+        assert info["video.channels"] == 1
+        assert info["is_depth_map"] is True
+
+    @require_hevc
+    def test_frame_count_and_duration_match_input(self, tmp_path):
+        num_frames = 10
+        fps = 30
+        video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps, depth=True)
+
+        with av.open(str(video_path)) as container:
+            stream = container.streams.video[0]
+            actual_frames = sum(1 for _ in container.decode(stream))
+            duration = (
+                float(stream.duration * stream.time_base)
+                if stream.duration is not None
+                else float(container.duration / av.time_base)
+            )
+
+        assert actual_frames == num_frames
+        assert abs(duration - num_frames / fps) < 0.1
+
+    def test_overwrite_false_skips_existing_file(self, tmp_path):
+        """Codec-agnostic: file-system semantics must hold even without an HEVC encoder."""
+        imgs_dir = tmp_path / "imgs"
+        _write_depth_frames(imgs_dir)
+        video_path = tmp_path / "out.mp4"
+        sentinel = b"pre-existing depth content"
+        video_path.write_bytes(sentinel)
+
+        encode_video_frames(imgs_dir, video_path, fps=30, video_encoder=DepthEncoderConfig(), overwrite=False)
+
+        assert video_path.read_bytes() == sentinel
+
+    @require_hevc
+    def test_overwrite_true_replaces_existing_file(self, tmp_path):
+        imgs_dir = tmp_path / "imgs"
+        _write_depth_frames(imgs_dir)
+        video_path = tmp_path / "out.mp4"
+        video_path.write_bytes(b"stale content")
+
+        encode_video_frames(imgs_dir, video_path, fps=30, video_encoder=DepthEncoderConfig(), overwrite=True)
+
+        info = get_video_info(video_path, video_encoder=DepthEncoderConfig())
+        assert info["video.height"] == 64
+        assert info["video.pix_fmt"] == "gray12le"
+        assert info["is_depth_map"] is True
+
+    @require_hevc
+    def test_custom_encoder_config_fields_stored_in_info(self, tmp_path):
+        """All stream-derived and depth-encoder config fields are present after encoding."""
+        cfg = DepthEncoderConfig(
+            vcodec="hevc",
+            pix_fmt="gray12le",
+            g=4,
+            crf=25,
+            depth_min=0.05,
+            depth_max=8.0,
+            shift=2.5,
+            use_log=False,
+        )
+        video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg, depth=True)
+
+        info = get_video_info(video_path, video_encoder=cfg)
+
+        # Stream-derived
+        assert info["video.height"] == 64
+        assert info["video.width"] == 96
+        assert info["video.channels"] == 1
+        assert info["video.codec"] == "hevc"
+        assert info["video.pix_fmt"] == "gray12le"
+        assert info["video.fps"] == 30
+        assert info["is_depth_map"] is True
+        assert info["has_audio"] is False
+        # Base encoder config
+        assert info["video.g"] == 4
+        assert info["video.crf"] == 25
+        assert info["video.fast_decode"] == 0
+        assert info["video.video_backend"] == "pyav"
+        assert info["video.extra_options"] == {}
+        # Depth-specific tuning
+        assert info["video.depth_min"] == 0.05
+        assert info["video.depth_max"] == 8.0
+        assert info["video.shift"] == 2.5
+        assert info["video.use_log"] is False
+
+
+class TestDepthEncoderConfigPersistence:
+    """Depth mirror of :class:`TestEncoderConfigPersistence`.
+
+    ``DepthEncoderConfig`` must be stored as ``video.<field>`` entries
+    (including the depth-specific ``depth_min`` / ``depth_max`` / ``shift`` /
+    ``use_log``) under ``info["features"][<depth_key>]["info"]`` when the
+    first episode is saved.
+    """
+
+    @require_hevc
+    def test_first_episode_save_persists_depth_encoder_config(self, tmp_path, empty_lerobot_dataset_factory):
+        cfg = DepthEncoderConfig(
+            vcodec="hevc",
+            pix_fmt="gray12le",
+            g=2,
+            crf=30,
+            depth_min=0.05,
+            depth_max=8.0,
+            shift=2.5,
+            use_log=False,
+        )
+        dataset = empty_lerobot_dataset_factory(
+            root=tmp_path / "ds", features=DUMMY_DEPTH_FEATURES, use_videos=True, depth_encoder=cfg
+        )
+
+        add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+        dataset.finalize()
+
+        info = _read_feature_info(dataset, key=DUMMY_DEPTH_KEY)
+
+        # Stream-derived
+        assert info["video.height"] == 64
+        assert info["video.width"] == 96
+        assert info["video.fps"] == 30
+        assert info["video.codec"] == "hevc"
+        assert info["video.pix_fmt"] == "gray12le"
+        assert info["is_depth_map"] is True
+        # Base encoder config
+        assert info["video.g"] == 2
+        assert info["video.crf"] == 30
+        assert info["video.fast_decode"] == 0
+        assert info["video.video_backend"] == "pyav"
+        assert info["video.extra_options"] == {}
+        # Depth-specific tuning
+        assert info["video.depth_min"] == 0.05
+        assert info["video.depth_max"] == 8.0
+        assert info["video.shift"] == 2.5
+        assert info["video.use_log"] is False
+
+    @require_hevc
+    def test_second_episode_does_not_overwrite_depth_encoder_fields(
+        self, tmp_path, empty_lerobot_dataset_factory
+    ):
+        cfg = DepthEncoderConfig(
+            vcodec="hevc",
+            pix_fmt="gray12le",
+            g=2,
+            crf=30,
+            depth_min=0.05,
+            depth_max=8.0,
+            shift=2.5,
+            use_log=False,
+        )
+        dataset = empty_lerobot_dataset_factory(
+            root=tmp_path / "ds", features=DUMMY_DEPTH_FEATURES, use_videos=True, depth_encoder=cfg
+        )
+
+        add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+        first_info = dict(_read_feature_info(dataset, key=DUMMY_DEPTH_KEY))
+
+        add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+        dataset.finalize()
+
+        assert _read_feature_info(dataset, key=DUMMY_DEPTH_KEY) == first_info
+
+
+class TestDepthFromVideoInfo:
+    """``DepthEncoderConfig.from_video_info`` reconstructs a depth encoder
+    config from the ``video.*`` keys persisted in a dataset's ``info.json``.
+
+    Depth mirror of :class:`TestFromVideoInfo`.
+    """
+
+    @require_hevc
+    def test_reconstructs_from_dummy_depth_video_info(self):
+        cfg = DepthEncoderConfig.from_video_info(DUMMY_DEPTH_VIDEO_INFO_FULL)
+
+        # No alias for ``"hevc"``; the canonical stream codec is reused as-is.
+        assert cfg.vcodec == "hevc"
+        assert cfg.pix_fmt == DUMMY_DEPTH_VIDEO_INFO_FULL["video.pix_fmt"]
+        assert cfg.g == DUMMY_DEPTH_VIDEO_INFO_FULL["video.g"]
+        assert cfg.crf == DUMMY_DEPTH_VIDEO_INFO_FULL["video.crf"]
+        assert cfg.fast_decode == DUMMY_DEPTH_VIDEO_INFO_FULL["video.fast_decode"]
+        assert cfg.video_backend == DUMMY_DEPTH_VIDEO_INFO_FULL["video.video_backend"]
+        # ``{}`` placeholder (typical after a merge with disagreeing sources)
+        # must not leak into the reconstructed config.
+        assert cfg.extra_options == DepthEncoderConfig().extra_options
+        # Depth-specific tuning round-trips through ``info.json``.
+        assert cfg.depth_min == DUMMY_DEPTH_VIDEO_INFO_FULL["video.depth_min"]
+        assert cfg.depth_max == DUMMY_DEPTH_VIDEO_INFO_FULL["video.depth_max"]
+        assert cfg.shift == DUMMY_DEPTH_VIDEO_INFO_FULL["video.shift"]
+        assert cfg.use_log == DUMMY_DEPTH_VIDEO_INFO_FULL["video.use_log"]
diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py
index 816deef32..d6f4f8ae5 100644
--- a/tests/fixtures/constants.py
+++ b/tests/fixtures/constants.py
@@ -50,6 +50,16 @@ DUMMY_DEPTH_VIDEO_INFO = {
     **DUMMY_VIDEO_INFO,
     "is_depth_map": True,
 }
+DUMMY_DEPTH_VIDEO_INFO_FULL = {
+    **{k: v for k, v in DUMMY_VIDEO_INFO.items() if k != "video.preset"},
+    "video.codec": "hevc",
+    "video.pix_fmt": "gray12le",
+    "is_depth_map": True,
+    "video.depth_min": 0.05,
+    "video.depth_max": 8.0,
+    "video.shift": 2.5,
+    "video.use_log": True,
+}
 DUMMY_DEPTH_CAMERA_FEATURES = {
     "laptop_depth": {
         "shape": (64, 96, 1),
@@ -60,3 +70,25 @@ DUMMY_DEPTH_CAMERA_FEATURES = {
 DUMMY_CAMERA_FEATURES_WITH_DEPTH = {**DUMMY_CAMERA_FEATURES, **DUMMY_DEPTH_CAMERA_FEATURES}
 DUMMY_CHW = (3, 96, 128)
 DUMMY_HWC = (96, 128, 3)
+
+# Default video feature set used by video-encoding persistence tests.
+DUMMY_VIDEO_FEATURES = {
+    "observation.images.cam": {
+        "dtype": "video",
+        "shape": (64, 96, 3),
+        "names": ["height", "width", "channels"],
+    },
+    "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]},
+}
+DUMMY_VIDEO_KEY = "observation.images.cam"
+
+DUMMY_DEPTH_FEATURES = {
+    "observation.images.depth": {
+        "dtype": "video",
+        "shape": (64, 96, 1),
+        "names": ["height", "width", "channels"],
+        "info": {"is_depth_map": True},
+    },
+    "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]},
+}
+DUMMY_DEPTH_KEY = "observation.images.depth"
diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py
index dec569db2..e85a99fc0 100644
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -38,6 +38,8 @@ from lerobot.datasets.utils import (
     DEFAULT_VIDEO_PATH,
     DatasetInfo,
 )
+from lerobot.datasets.video_utils import encode_video_frames
+from lerobot.utils.constants import DEFAULT_FEATURES
 from tests.fixtures.constants import (
     DEFAULT_FPS,
     DUMMY_CAMERA_FEATURES,
@@ -45,13 +47,9 @@ from tests.fixtures.constants import (
     DUMMY_REPO_ID,
     DUMMY_ROBOT_TYPE,
 )
-from lerobot.datasets.video_utils import encode_video_frames
-from lerobot.utils.constants import DEFAULT_FEATURES
 
 
-def add_frames(
-    dataset: LeRobotDataset, num_frames: int
-) -> None:
+def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
     """Append ``num_frames`` synthetic frames to ``dataset``.
 
     Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for
@@ -59,9 +57,8 @@ def add_frames(
     and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp,
     frame_index, ...) are auto-populated by ``add_frame`` and skipped here.
     """
-    if video_keys is None:
-        video_keys = dataset.meta.video_keys
-    depth_keys = set(dataset.meta.depth_keys)
+    video_keys = dataset.meta.video_keys
+    depth_keys = dataset.meta.depth_keys
     # Smooth gradient base reused per (H, W) to keep depth frames cheap to
     # encode (HEVC Main 12 hates white noise).
     _depth_base_cache: dict[tuple[int, int], np.ndarray] = {}