fix(plumbing): add missing pieces to the depth-map pipeline

This commit is contained in:
CarolinePascal
2026-05-21 16:11:01 +02:00
parent 4a49f4a391
commit 2b8d7b3c06
5 changed files with 21 additions and 6 deletions
+3 -1
View File
@@ -18,7 +18,7 @@ from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from .video import VideoEncoderConfig, camera_encoder_defaults from .video import DepthEncoderConfig, VideoEncoderConfig, camera_encoder_defaults, depth_encoder_defaults
@dataclass @dataclass
@@ -60,6 +60,8 @@ class DatasetRecordConfig:
# Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys, # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
# e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``). # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
# Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys under ``--dataset.depth_encoder.`` (see ``DepthEncoderConfig``).
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
# Enable streaming video encoding: encode frames in real-time during capture instead # Enable streaming video encoding: encode frames in real-time during capture instead
# of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
streaming_encoding: bool = False streaming_encoding: bool = False
+12 -4
View File
@@ -77,7 +77,12 @@ def _encode_video_worker(
encoder_threads: int | None = None, encoder_threads: int | None = None,
) -> Path: ) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) path_template = (
DEFAULT_DEPTH_PATH
if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig)
else DEFAULT_IMAGE_PATH
)
fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent img_dir = (root / fpath).parent
encode_video_frames( encode_video_frames(
img_dir, img_dir,
@@ -305,7 +310,9 @@ class DatasetWriter:
episode_index, episode_index,
self._root, self._root,
self._meta.fps, self._meta.fps,
self._camera_encoder, self._depth_encoder
if video_key in self._meta.depth_keys
else self._camera_encoder,
self._encoder_threads, self._encoder_threads,
): video_key ): video_key
for video_key in self._meta.video_keys for video_key in self._meta.video_keys
@@ -588,13 +595,14 @@ class DatasetWriter:
self.image_writer.wait_until_done() self.image_writer.wait_until_done()
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos.""" """Use ffmpeg to convert frames stored as png/tiff into mp4 videos."""
is_depth = video_key in self._meta.depth_keys
return _encode_video_worker( return _encode_video_worker(
video_key, video_key,
episode_index, episode_index,
self._root, self._root,
self._meta.fps, self._meta.fps,
self._camera_encoder, self._depth_encoder if is_depth else self._camera_encoder,
self._encoder_threads, self._encoder_threads,
) )
+1 -1
View File
@@ -196,7 +196,7 @@ def dequantize_depth(
else: else:
depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m
depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False) depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False)
# Add single-channel dim: (H, W) → (H, W, 1) # Add single-channel dim: (H, W) → (H, W, 1)
if depth_m.ndim == 2: if depth_m.ndim == 2:
depth_m = depth_m[..., np.newaxis] depth_m = depth_m[..., np.newaxis]
+3
View File
@@ -538,6 +538,7 @@ class _CameraEncoderThread(threading.Thread):
frame_queue: queue.Queue, frame_queue: queue.Queue,
result_queue: queue.Queue, result_queue: queue.Queue,
stop_event: threading.Event, stop_event: threading.Event,
encoder_threads: int | None = None,
): ):
super().__init__(daemon=True) super().__init__(daemon=True)
self.video_path = video_path self.video_path = video_path
@@ -547,6 +548,7 @@ class _CameraEncoderThread(threading.Thread):
self.frame_queue = frame_queue self.frame_queue = frame_queue
self.result_queue = result_queue self.result_queue = result_queue
self.stop_event = stop_event self.stop_event = stop_event
self.encoder_threads = encoder_threads
def run(self) -> None: def run(self) -> None:
from .compute_stats import RunningQuantileStats, auto_downsample_height_width from .compute_stats import RunningQuantileStats, auto_downsample_height_width
@@ -723,6 +725,7 @@ class StreamingVideoEncoder:
frame_queue=frame_queue, frame_queue=frame_queue,
result_queue=result_queue, result_queue=result_queue,
stop_event=stop_event, stop_event=stop_event,
encoder_threads=self._encoder_threads,
) )
encoder_thread.start() encoder_thread.start()
+2
View File
@@ -333,6 +333,7 @@ def build_rollout_context(
root=cfg.dataset.root, root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size, batch_encoding_size=cfg.dataset.video_encoding_batch_size,
camera_encoder=cfg.dataset.camera_encoder, camera_encoder=cfg.dataset.camera_encoder,
depth_encoder=cfg.dataset.depth_encoder,
streaming_encoding=cfg.dataset.streaming_encoding, streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads, encoder_threads=cfg.dataset.encoder_threads,
@@ -368,6 +369,7 @@ def build_rollout_context(
* len(robot.cameras if hasattr(robot, "cameras") else []), * len(robot.cameras if hasattr(robot, "cameras") else []),
batch_encoding_size=cfg.dataset.video_encoding_batch_size, batch_encoding_size=cfg.dataset.video_encoding_batch_size,
camera_encoder=cfg.dataset.camera_encoder, camera_encoder=cfg.dataset.camera_encoder,
depth_encoder=cfg.dataset.depth_encoder,
streaming_encoding=cfg.dataset.streaming_encoding, streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads, encoder_threads=cfg.dataset.encoder_threads,