fix(plumbing): fixing missing parts in the depth maps pipeline

This commit is contained in:
CarolinePascal
2026-05-21 16:11:01 +02:00
parent 4a49f4a391
commit 2b8d7b3c06
5 changed files with 21 additions and 6 deletions
+3 -1
View File
@@ -18,7 +18,7 @@ from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from .video import VideoEncoderConfig, camera_encoder_defaults
from .video import DepthEncoderConfig, VideoEncoderConfig, camera_encoder_defaults, depth_encoder_defaults
@dataclass
@@ -60,6 +60,8 @@ class DatasetRecordConfig:
# Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
# e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
# Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys.
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
# Enable streaming video encoding: encode frames in real-time during capture instead
# of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
streaming_encoding: bool = False
+12 -4
View File
@@ -77,7 +77,12 @@ def _encode_video_worker(
encoder_threads: int | None = None,
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
path_template = (
DEFAULT_DEPTH_PATH
if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig)
else DEFAULT_IMAGE_PATH
)
fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent
encode_video_frames(
img_dir,
@@ -305,7 +310,9 @@ class DatasetWriter:
episode_index,
self._root,
self._meta.fps,
self._camera_encoder,
self._depth_encoder
if video_key in self._meta.depth_keys
else self._camera_encoder,
self._encoder_threads,
): video_key
for video_key in self._meta.video_keys
@@ -588,13 +595,14 @@ class DatasetWriter:
self.image_writer.wait_until_done()
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
"""Use ffmpeg to convert frames stored as png/tiff into mp4 videos."""
is_depth = video_key in self._meta.depth_keys
return _encode_video_worker(
video_key,
episode_index,
self._root,
self._meta.fps,
self._camera_encoder,
self._depth_encoder if is_depth else self._camera_encoder,
self._encoder_threads,
)
+1 -1
View File
@@ -196,7 +196,7 @@ def dequantize_depth(
else:
depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m
depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False)
# Add single-channel dim: (H, W) → (H, W, 1)
if depth_m.ndim == 2:
depth_m = depth_m[..., np.newaxis]
+3
View File
@@ -538,6 +538,7 @@ class _CameraEncoderThread(threading.Thread):
frame_queue: queue.Queue,
result_queue: queue.Queue,
stop_event: threading.Event,
encoder_threads: int | None = None,
):
super().__init__(daemon=True)
self.video_path = video_path
@@ -547,6 +548,7 @@ class _CameraEncoderThread(threading.Thread):
self.frame_queue = frame_queue
self.result_queue = result_queue
self.stop_event = stop_event
self.encoder_threads = encoder_threads
def run(self) -> None:
from .compute_stats import RunningQuantileStats, auto_downsample_height_width
@@ -723,6 +725,7 @@ class StreamingVideoEncoder:
frame_queue=frame_queue,
result_queue=result_queue,
stop_event=stop_event,
encoder_threads=self._encoder_threads,
)
encoder_thread.start()
+2
View File
@@ -333,6 +333,7 @@ def build_rollout_context(
root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
camera_encoder=cfg.dataset.camera_encoder,
depth_encoder=cfg.dataset.depth_encoder,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
@@ -368,6 +369,7 @@ def build_rollout_context(
* len(robot.cameras if hasattr(robot, "cameras") else []),
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
camera_encoder=cfg.dataset.camera_encoder,
depth_encoder=cfg.dataset.depth_encoder,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,