From 2b8d7b3c06799fe20ab544b641650a289317dd17 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Thu, 21 May 2026 16:11:01 +0200 Subject: [PATCH] fix(plumbing): fixing missing parts in the depth maps pipeline --- src/lerobot/configs/dataset.py | 4 +++- src/lerobot/datasets/dataset_writer.py | 16 ++++++++++++---- src/lerobot/datasets/depth_utils.py | 2 +- src/lerobot/datasets/video_utils.py | 3 +++ src/lerobot/rollout/context.py | 2 ++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py index d5c6fa312..ddb37822d 100644 --- a/src/lerobot/configs/dataset.py +++ b/src/lerobot/configs/dataset.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from .video import VideoEncoderConfig, camera_encoder_defaults +from .video import DepthEncoderConfig, VideoEncoderConfig, camera_encoder_defaults, depth_encoder_defaults @dataclass @@ -60,6 +60,8 @@ class DatasetRecordConfig: # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys, # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``). camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + # Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys. + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) # Enable streaming video encoding: encode frames in real-time during capture instead # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding streaming_encoding: bool = False diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py index a79d0b57c..c51fe6706 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -77,7 +77,12 @@ def _encode_video_worker( encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" - fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) + path_template = ( + DEFAULT_DEPTH_PATH + if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig) + else DEFAULT_IMAGE_PATH + ) + fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0) img_dir = (root / fpath).parent encode_video_frames( img_dir, @@ -305,7 +310,9 @@ class DatasetWriter: episode_index, self._root, self._meta.fps, - self._camera_encoder, + self._depth_encoder + if video_key in self._meta.depth_keys + else self._camera_encoder, self._encoder_threads, ): video_key for video_key in self._meta.video_keys @@ -588,13 +595,14 @@ class DatasetWriter: self.image_writer.wait_until_done() def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: - """Use ffmpeg to convert frames stored as png into mp4 videos.""" + """Use ffmpeg to convert frames stored as png/tiff into mp4 videos.""" + is_depth = video_key in self._meta.depth_keys return _encode_video_worker( video_key, episode_index, self._root, self._meta.fps, - self._camera_encoder, + self._depth_encoder if is_depth else self._camera_encoder, self._encoder_threads, ) diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py index e7db76398..1c641f7f4 100644 --- a/src/lerobot/datasets/depth_utils.py +++ b/src/lerobot/datasets/depth_utils.py @@ -196,7 +196,7 @@ def dequantize_depth( else: depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False) - + # Add single-channel dim: (H, W) → (H, W, 1) if depth_m.ndim == 2: depth_m = depth_m[..., np.newaxis] diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 3951e69d0..f21508b01 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -538,6 +538,7 @@ class _CameraEncoderThread(threading.Thread): frame_queue: queue.Queue, result_queue: queue.Queue, stop_event: threading.Event, + encoder_threads: int | None = None, ): super().__init__(daemon=True) self.video_path = video_path @@ -547,6 +548,7 @@ class _CameraEncoderThread(threading.Thread): self.frame_queue = frame_queue self.result_queue = result_queue self.stop_event = stop_event + self.encoder_threads = encoder_threads def run(self) -> None: from .compute_stats import RunningQuantileStats, auto_downsample_height_width @@ -723,6 +725,7 @@ class StreamingVideoEncoder: frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, + encoder_threads=self._encoder_threads, ) encoder_thread.start() diff --git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py index bf5fa0fd4..527ba525a 100644 --- a/src/lerobot/rollout/context.py +++ b/src/lerobot/rollout/context.py @@ -333,6 +333,7 @@ def build_rollout_context( root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, camera_encoder=cfg.dataset.camera_encoder, + depth_encoder=cfg.dataset.depth_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, @@ -368,6 +369,7 @@ def build_rollout_context( * len(robot.cameras if hasattr(robot, "cameras") else []), batch_encoding_size=cfg.dataset.video_encoding_batch_size, camera_encoder=cfg.dataset.camera_encoder, + depth_encoder=cfg.dataset.depth_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads,