fix(plumbing): fixing missing parts in the depth maps pipeline

2026-05-22 20:19:43 +00:00 · 2026-05-21 16:11:01 +02:00
parent 4a49f4a391
commit 2b8d7b3c06
5 changed files with 21 additions and 6 deletions
@@ -18,7 +18,7 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path

-from .video import VideoEncoderConfig, camera_encoder_defaults
+from .video import DepthEncoderConfig, VideoEncoderConfig, camera_encoder_defaults, depth_encoder_defaults


@dataclass
@@ -60,6 +60,8 @@ class DatasetRecordConfig:
    # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
    # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
    camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
+    # Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys.
+    depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
    # Enable streaming video encoding: encode frames in real-time during capture instead
    # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
    streaming_encoding: bool = False
@@ -77,7 +77,12 @@ def _encode_video_worker(
    encoder_threads: int | None = None,
 ) -> Path:
    temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
-    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
+    path_template = (
+        DEFAULT_DEPTH_PATH
+        if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig)
+        else DEFAULT_IMAGE_PATH
+    )
+    fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0)
    img_dir = (root / fpath).parent
    encode_video_frames(
        img_dir,
@@ -305,7 +310,9 @@ class DatasetWriter:
                            episode_index,
                            self._root,
                            self._meta.fps,
-                            self._camera_encoder,
+                            self._depth_encoder
+                            if video_key in self._meta.depth_keys
+                            else self._camera_encoder,
                            self._encoder_threads,
                        ): video_key
                        for video_key in self._meta.video_keys
@@ -588,13 +595,14 @@ class DatasetWriter:
            self.image_writer.wait_until_done()

    def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
-        """Use ffmpeg to convert frames stored as png into mp4 videos."""
+        """Use ffmpeg to convert frames stored as png/tiff into mp4 videos."""
+        is_depth = video_key in self._meta.depth_keys
        return _encode_video_worker(
            video_key,
            episode_index,
            self._root,
            self._meta.fps,
-            self._camera_encoder,
+            self._depth_encoder if is_depth else self._camera_encoder,
            self._encoder_threads,
        )

@@ -196,7 +196,7 @@ def dequantize_depth(
    else:
        depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m
    depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False)
-    
+
    # Add single-channel dim: (H, W) → (H, W, 1)
    if depth_m.ndim == 2:
        depth_m = depth_m[..., np.newaxis]
@@ -538,6 +538,7 @@ class _CameraEncoderThread(threading.Thread):
        frame_queue: queue.Queue,
        result_queue: queue.Queue,
        stop_event: threading.Event,
+        encoder_threads: int | None = None,
    ):
        super().__init__(daemon=True)
        self.video_path = video_path
@@ -547,6 +548,7 @@ class _CameraEncoderThread(threading.Thread):
        self.frame_queue = frame_queue
        self.result_queue = result_queue
        self.stop_event = stop_event
+        self.encoder_threads = encoder_threads

    def run(self) -> None:
        from .compute_stats import RunningQuantileStats, auto_downsample_height_width
@@ -723,6 +725,7 @@ class StreamingVideoEncoder:
                frame_queue=frame_queue,
                result_queue=result_queue,
                stop_event=stop_event,
+                encoder_threads=self._encoder_threads,
            )
            encoder_thread.start()

@@ -333,6 +333,7 @@ def build_rollout_context(
                root=cfg.dataset.root,
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
                encoder_threads=cfg.dataset.encoder_threads,
@@ -368,6 +369,7 @@ def build_rollout_context(
                * len(robot.cameras if hasattr(robot, "cameras") else []),
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
                encoder_threads=cfg.dataset.encoder_threads,