From a694e327740e369340fcc3e2b2a7484fd2b8e644 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Mon, 15 Jun 2026 14:31:42 +0200
Subject: [PATCH] feat(dataset tools): add missing docstrings and features
 for depth fill support in dataset editing tools

---
 src/lerobot/datasets/dataset_tools.py       | 29 ++++++-------
 src/lerobot/scripts/lerobot_edit_dataset.py | 33 ++++++++++++++-
 tests/datasets/test_dataset_tools.py        | 26 +++++++++++-
 tests/scripts/test_edit_dataset_parsing.py  | 45 +++++++++++++++++++++
 4 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py
index c64c4c29c..5170c0137 100644
--- a/src/lerobot/datasets/dataset_tools.py
+++ b/src/lerobot/datasets/dataset_tools.py
@@ -608,7 +608,7 @@ def _keep_episodes_from_video_with_av(
     output_path: Path,
     episodes_to_keep: list[tuple[int, int]],
     fps: float,
-    camera_encoder: VideoEncoderConfig,
+    video_encoder: VideoEncoderConfig,
 ) -> None:
     """Keep only specified episodes from a video file using PyAV.
 
@@ -622,7 +622,7 @@ def _keep_episodes_from_video_with_av(
             Ranges are half-open intervals: [start_frame, end_frame), where start_frame
             is inclusive and end_frame is exclusive.
         fps: Frame rate of the video.
-        camera_encoder: Video encoder settings used to re-encode the kept frames.
+        video_encoder: Video encoder settings used to re-encode the kept frames.
     """
     from fractions import Fraction
 
@@ -647,13 +647,13 @@ def _keep_episodes_from_video_with_av(
 
     # Convert fps to Fraction for PyAV compatibility.
     fps_fraction = Fraction(fps).limit_denominator(1000)
-    codec_options = camera_encoder.get_codec_options(as_strings=True)
-    v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options)
+    codec_options = video_encoder.get_codec_options(as_strings=True)
+    v_out = out.add_stream(video_encoder.vcodec, rate=fps_fraction, options=codec_options)
 
     # PyAV type stubs don't distinguish video streams from audio/subtitle streams.
     v_out.width = v_in.codec_context.width
     v_out.height = v_in.codec_context.height
-    v_out.pix_fmt = camera_encoder.pix_fmt
+    v_out.pix_fmt = video_encoder.pix_fmt
 
     # Set time_base to match the frame rate for proper timestamp handling.
     v_out.time_base = Fraction(1, int(fps))
@@ -1670,7 +1670,7 @@ def convert_image_to_video_dataset(
     output_dir: Path | None = None,
     repo_id: str | None = None,
     camera_encoder: VideoEncoderConfig | None = None,
-    depth_encoder: VideoEncoderConfig | None = None,
+    depth_encoder: DepthEncoderConfig | None = None,
     episode_indices: list[int] | None = None,
     num_workers: int = 4,
     max_episodes_per_batch: int | None = None,
@@ -1685,8 +1685,11 @@ def convert_image_to_video_dataset(
         dataset: The source LeRobot dataset with images
         output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
         repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
-        camera_encoder: Video encoder settings
+        camera_encoder: Video encoder settings applied to RGB cameras
             (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
+        depth_encoder: Video encoder settings applied to depth-map cameras, including
+            the quantization parameters persisted to the dataset metadata
+            (``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`).
         episode_indices: List of episode indices to convert (None = all episodes)
         num_workers: Number of threads for parallel processing (default: 4)
         max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1879,13 +1882,11 @@ def convert_image_to_video_dataset(
         # Update video info for all image keys (now videos)
         # We need to manually set video info since update_video_info() checks video_keys first
         for img_key in img_keys:
-            if not new_meta.features[img_key].get("info", None):
-                video_path = new_meta.root / new_meta.video_path.format(
-                    video_key=img_key, chunk_index=0, file_index=0
-                )
-                new_meta.info.features[img_key]["info"] = get_video_info(
-                    video_path, video_encoder=camera_encoder
-                )
+            target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else camera_encoder
+            video_path = new_meta.root / new_meta.video_path.format(
+                video_key=img_key, chunk_index=0, file_index=0
+            )
+            new_meta.info.features[img_key]["info"] = get_video_info(video_path, video_encoder=target_encoder)
 
         write_info(new_meta.info, new_meta.root)
 
diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py
index eaadf47de..ab0be4a41 100644
--- a/src/lerobot/scripts/lerobot_edit_dataset.py
+++ b/src/lerobot/scripts/lerobot_edit_dataset.py
@@ -133,6 +133,15 @@ Convert image dataset to video format and save locally:
         --new_root /path/to/output/pusht_video \
         --operation.type convert_image_to_video
 
+Convert image dataset (with depth maps) to video format, customizing the depth encoder:
+    lerobot-edit-dataset \
+        --repo_id lerobot/pusht_image \
+        --new_root /path/to/output/pusht_video \
+        --operation.type convert_image_to_video \
+        --operation.depth_encoder.depth_min 0.01 \
+        --operation.depth_encoder.depth_max 10.0 \
+        --operation.depth_encoder.use_log true
+
 Convert image dataset to video format and save with new repo_id:
     lerobot-edit-dataset \
         --repo_id lerobot/pusht_image \
@@ -211,6 +220,13 @@ Re-encode videos in-place (overwrites original dataset):
         --operation.camera_encoder.vcodec h264 \
         --operation.overwrite true
 
+Re-encode both RGB and depth videos in a dataset (depth quantization params are preserved):
+    lerobot-edit-dataset \
+        --repo_id lerobot/pusht_depth \
+        --operation.type reencode_videos \
+        --operation.camera_encoder.vcodec libx264 \
+        --operation.depth_encoder.vcodec ffv1
+
 Using JSON config file:
     lerobot-edit-dataset \
         --config_path path/to/edit_config.json
@@ -225,7 +241,13 @@ from pathlib import Path
 
 import draccus
 
-from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser
+from lerobot.configs import (
+    DepthEncoderConfig,
+    VideoEncoderConfig,
+    camera_encoder_defaults,
+    depth_encoder_defaults,
+    parser,
+)
 from lerobot.datasets import (
     LeRobotDataset,
     convert_image_to_video_dataset,
@@ -288,6 +310,7 @@ class ModifyTasksConfig(OperationConfig):
 class ConvertImageToVideoConfig(OperationConfig):
     output_dir: str | None = None
     camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
+    depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
     episode_indices: list[int] | None = None
     num_workers: int = 4
     max_episodes_per_batch: int | None = None
@@ -309,6 +332,7 @@ class RecomputeStatsConfig(OperationConfig):
 @dataclass
 class ReencodeVideosConfig(OperationConfig):
     camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
+    depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
     num_workers: int = 0
     encoder_threads: int | None = None
     overwrite: bool = False
@@ -602,6 +626,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
         output_dir=output_dir,
         repo_id=output_repo_id,
         camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(),
+        depth_encoder=getattr(cfg.operation, "depth_encoder", None) or depth_encoder_defaults(),
         episode_indices=getattr(cfg.operation, "episode_indices", None),
         num_workers=getattr(cfg.operation, "num_workers", 4),
         max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
@@ -719,10 +744,14 @@ def handle_reencode_videos(cfg: EditDatasetConfig) -> None:
         shutil.copytree(input_root, output_root)
         dataset = LeRobotDataset(output_repo_id, root=output_root)
 
-    logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}")
+    logging.info(
+        f"Re-encoding videos in {output_repo_id} with RGB encoder {cfg.operation.camera_encoder} "
+        f"and depth encoder {cfg.operation.depth_encoder}"
+    )
     reencode_dataset(
         dataset,
         camera_encoder=cfg.operation.camera_encoder,
+        depth_encoder=cfg.operation.depth_encoder,
         encoder_threads=cfg.operation.encoder_threads,
         num_workers=cfg.operation.num_workers,
     )
diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py
index 0633ec900..440ea3bb8 100644
--- a/tests/datasets/test_dataset_tools.py
+++ b/tests/datasets/test_dataset_tools.py
@@ -1380,12 +1380,24 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
         mock_get_safe_version.return_value = "v3.0"
         mock_snapshot_download.return_value = str(output_dir)
 
+        # Use non-default quantization params so the persisted metadata must
+        # come from the depth encoder (not RGB encoder defaults).
+        depth_encoder = DepthEncoderConfig(
+            vcodec="hevc",
+            pix_fmt="gray12le",
+            g=2,
+            crf=30,
+            depth_min=0.05,
+            depth_max=8.0,
+            shift=2.0,
+            use_log=False,
+        )
         video_dataset = convert_image_to_video_dataset(
             dataset=source_dataset,
             output_dir=output_dir,
             repo_id="dummy/depth_video",
             camera_encoder=VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
-            depth_encoder=DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=2, crf=30),
+            depth_encoder=depth_encoder,
             num_workers=1,
         )
 
@@ -1398,6 +1410,18 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
     depth_path = video_dataset.root / video_dataset.meta.get_video_file_path(0, "observation.images.depth")
     assert depth_path.exists(), f"Depth video file should exist: {depth_path}"
 
+    # The persisted depth-video metadata must carry the depth quantization params
+    # from the depth encoder (so frames dequantize correctly on read), and the RGB
+    # camera must not be marked as a depth map.
+    persisted_info = load_info(video_dataset.root)
+    depth_info = persisted_info.features["observation.images.depth"]["info"]
+    assert depth_info["is_depth_map"] is True
+    assert DepthEncoderConfig.from_video_info(depth_info) == depth_encoder
+
+    cam_info = persisted_info.features["observation.images.cam"]["info"]
+    assert cam_info.get("is_depth_map") is False
+    assert "video.codec" in cam_info
+
 
 # ─── reencode_dataset ─────────────────────────────────────────────────
 
diff --git a/tests/scripts/test_edit_dataset_parsing.py b/tests/scripts/test_edit_dataset_parsing.py
index c90cffb38..4dfad81f9 100644
--- a/tests/scripts/test_edit_dataset_parsing.py
+++ b/tests/scripts/test_edit_dataset_parsing.py
@@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import (
     MergeConfig,
     ModifyTasksConfig,
     OperationConfig,
+    ReencodeVideosConfig,
     RemoveFeatureConfig,
     SplitConfig,
     _validate_config,
@@ -103,3 +104,47 @@ class TestOperationTypeParsing:
         )
         resolved_name = OperationConfig.get_choice_name(type(cfg.operation))
         assert resolved_name == type_name
+
+
+class TestDepthEncoderParsing:
+    """Test that the depth encoder is exposed and parsed for video operations."""
+
+    def test_reencode_has_default_depth_encoder(self):
+        cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", "reencode_videos"])
+        assert isinstance(cfg.operation, ReencodeVideosConfig)
+        # No depth flags are passed here: the config must still supply a depth encoder by default.
+        assert cfg.operation.depth_encoder is not None
+        assert hasattr(cfg.operation.depth_encoder, "depth_min")  # depth-specific quantization field
+
+    def test_reencode_parses_depth_encoder_overrides(self):
+        cfg = parse_cfg(
+            [  # dotted --operation.depth_encoder.* flags address the nested encoder config
+                "--repo_id",
+                "test/repo",
+                "--operation.type",
+                "reencode_videos",
+                "--operation.depth_encoder.vcodec",
+                "ffv1",
+                "--operation.depth_encoder.depth_max",
+                "12.0",
+                "--operation.depth_encoder.use_log",
+                "false",
+            ]
+        )
+        assert cfg.operation.depth_encoder.vcodec == "ffv1"
+        assert cfg.operation.depth_encoder.depth_max == 12.0  # CLI string "12.0" parsed to a float
+        assert cfg.operation.depth_encoder.use_log is False  # CLI string "false" parsed to a bool
+
+    def test_convert_image_to_video_parses_depth_encoder_overrides(self):
+        cfg = parse_cfg(
+            [  # same nested depth-encoder flag, on the convert_image_to_video operation
+                "--repo_id",
+                "test/repo",
+                "--operation.type",
+                "convert_image_to_video",
+                "--operation.depth_encoder.depth_min",
+                "0.05",
+            ]
+        )
+        assert isinstance(cfg.operation, ConvertImageToVideoConfig)
+        assert cfg.operation.depth_encoder.depth_min == 0.05  # CLI string "0.05" parsed to a float