feat(depth): plumb DepthEncoderConfig through LeRobotDataset and DatasetWriter

This commit is contained in:
CarolinePascal
2026-05-19 22:50:19 +02:00
parent 0cc5162078
commit b4c31f0f67
5 changed files with 47 additions and 25 deletions
+8 -3
View File
@@ -31,7 +31,7 @@ import PIL.Image
import pyarrow.parquet as pq import pyarrow.parquet as pq
import torch import torch
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, DepthEncoderConfig, depth_encoder_defaults
from .compute_stats import compute_episode_stats from .compute_stats import compute_episode_stats
from .dataset_metadata import LeRobotDatasetMetadata from .dataset_metadata import LeRobotDatasetMetadata
@@ -67,7 +67,7 @@ def _encode_video_worker(
episode_index: int, episode_index: int,
root: Path, root: Path,
fps: int, fps: int,
camera_encoder: VideoEncoderConfig | None = None, video_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None, encoder_threads: int | None = None,
) -> Path: ) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
@@ -77,7 +77,7 @@ def _encode_video_worker(
img_dir, img_dir,
temp_path, temp_path,
fps, fps,
camera_encoder=camera_encoder, video_encoder=video_encoder,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
overwrite=True, overwrite=True,
) )
@@ -97,6 +97,7 @@ class DatasetWriter:
meta: LeRobotDatasetMetadata, meta: LeRobotDatasetMetadata,
root: Path, root: Path,
camera_encoder: VideoEncoderConfig | None, camera_encoder: VideoEncoderConfig | None,
depth_encoder: DepthEncoderConfig | None,
encoder_threads: int | None, encoder_threads: int | None,
batch_encoding_size: int, batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None, streaming_encoder: StreamingVideoEncoder | None = None,
@@ -110,6 +111,8 @@ class DatasetWriter:
root: Local dataset root directory. root: Local dataset root directory.
camera_encoder: Video encoder settings applied to all cameras. camera_encoder: Video encoder settings applied to all cameras.
``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`. ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`.
depth_encoder: Video encoder settings applied to all **depth** cameras.
``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`.
encoder_threads: Number of encoder threads (global). ``None`` encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide. lets the codec decide.
batch_encoding_size: Number of episodes to accumulate before batch_encoding_size: Number of episodes to accumulate before
@@ -121,6 +124,7 @@ class DatasetWriter:
self._meta = meta self._meta = meta
self._root = root self._root = root
self._camera_encoder = camera_encoder or camera_encoder_defaults() self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._depth_encoder = depth_encoder or depth_encoder_defaults()
self._encoder_threads = encoder_threads self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder self._streaming_encoder = streaming_encoder
@@ -195,6 +199,7 @@ class DatasetWriter:
if frame_index == 0 and self._streaming_encoder is not None: if frame_index == 0 and self._streaming_encoder is not None:
self._streaming_encoder.start_episode( self._streaming_encoder.start_episode(
video_keys=list(self._meta.video_keys), video_keys=list(self._meta.video_keys),
depth_video_keys=set(self._meta.video_keys) & set(self._meta.depth_keys),
temp_dir=self._root, temp_dir=self._root,
) )
+14 -2
View File
@@ -24,7 +24,7 @@ import torch.utils
from huggingface_hub import HfApi, snapshot_download from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.errors import RevisionNotFoundError from huggingface_hub.errors import RevisionNotFoundError
from lerobot.configs import VideoEncoderConfig from lerobot.configs import VideoEncoderConfig, DepthEncoderConfig
from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE
from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
@@ -60,6 +60,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
return_uint8: bool = False, return_uint8: bool = False,
batch_encoding_size: int = 1, batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None, camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
encoder_threads: int | None = None, encoder_threads: int | None = None,
streaming_encoding: bool = False, streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30, encoder_queue_maxsize: int = 30,
@@ -186,6 +187,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras
(codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults`
is used by the writer. is used by the writer.
depth_encoder (DepthEncoderConfig | None, optional): Video encoder settings for depth cameras
(codec, quality, etc.). When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults`
is used by the writer.
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
codec decide. codec decide.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
@@ -273,6 +277,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = self._build_streaming_encoder( streaming_enc = self._build_streaming_encoder(
self.meta.fps, self.meta.fps,
camera_encoder, camera_encoder,
depth_encoder,
encoder_queue_maxsize, encoder_queue_maxsize,
encoder_threads, encoder_threads,
) )
@@ -280,6 +285,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
meta=self.meta, meta=self.meta,
root=self.root, root=self.root,
camera_encoder=camera_encoder, camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size, batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc, streaming_encoder=streaming_enc,
@@ -322,12 +328,14 @@ class LeRobotDataset(torch.utils.data.Dataset):
def _build_streaming_encoder( def _build_streaming_encoder(
fps: int, fps: int,
camera_encoder: VideoEncoderConfig | None, camera_encoder: VideoEncoderConfig | None,
depth_encoder: DepthEncoderConfig | None,
encoder_queue_maxsize: int, encoder_queue_maxsize: int,
encoder_threads: int | None, encoder_threads: int | None,
) -> StreamingVideoEncoder: ) -> StreamingVideoEncoder:
return StreamingVideoEncoder( return StreamingVideoEncoder(
fps=fps, fps=fps,
camera_encoder=camera_encoder, camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
queue_maxsize=encoder_queue_maxsize, queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
) )
@@ -645,6 +653,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None, video_backend: str | None = None,
batch_encoding_size: int = 1, batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None, camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
metadata_buffer_size: int = 10, metadata_buffer_size: int = 10,
streaming_encoding: bool = False, streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30, encoder_queue_maxsize: int = 30,
@@ -677,6 +686,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
batch-encoding videos. ``1`` means encode immediately. batch-encoding videos. ``1`` means encode immediately.
camera_encoder: Video encoder settings for cameras (codec, quality, etc.). camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` is used.
encoder_threads: Number of encoder threads (global). ``None`` encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide. lets the codec decide.
metadata_buffer_size: Number of episode metadata records to buffer metadata_buffer_size: Number of episode metadata records to buffer
@@ -720,12 +731,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0: if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder( streaming_enc = cls._build_streaming_encoder(
fps, camera_encoder, encoder_queue_maxsize, encoder_threads fps, camera_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads
) )
obj.writer = DatasetWriter( obj.writer = DatasetWriter(
meta=obj.meta, meta=obj.meta,
root=obj.root, root=obj.root,
camera_encoder=camera_encoder, camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size, batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc, streaming_encoder=streaming_enc,
+17 -12
View File
@@ -38,7 +38,9 @@ from PIL import Image
from lerobot.configs import ( from lerobot.configs import (
VideoEncoderConfig, VideoEncoderConfig,
DepthEncoderConfig,
camera_encoder_defaults, camera_encoder_defaults,
depth_encoder_defaults,
) )
from lerobot.utils.import_utils import get_safe_default_video_backend from lerobot.utils.import_utils import get_safe_default_video_backend
@@ -335,17 +337,17 @@ def encode_video_frames(
imgs_dir: Path | str, imgs_dir: Path | str,
video_path: Path | str, video_path: Path | str,
fps: int, fps: int,
camera_encoder: VideoEncoderConfig | None = None, video_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None, encoder_threads: int | None = None,
*, *,
log_level: int | None = av.logging.WARNING, log_level: int | None = av.logging.WARNING,
overwrite: bool = False, overwrite: bool = False,
) -> None: ) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
if camera_encoder is None: if video_encoder is None:
camera_encoder = camera_encoder_defaults() video_encoder = camera_encoder_defaults()
vcodec = camera_encoder.vcodec vcodec = video_encoder.vcodec
pix_fmt = camera_encoder.pix_fmt pix_fmt = video_encoder.pix_fmt
video_path = Path(video_path) video_path = Path(video_path)
imgs_dir = Path(imgs_dir) imgs_dir = Path(imgs_dir)
@@ -367,7 +369,7 @@ def encode_video_frames(
with Image.open(input_list[0]) as dummy_image: with Image.open(input_list[0]) as dummy_image:
width, height = dummy_image.size width, height = dummy_image.size
video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True)
# Set logging level # Set logging level
if log_level is not None: if log_level is not None:
@@ -639,6 +641,7 @@ class StreamingVideoEncoder:
self, self,
fps: int, fps: int,
camera_encoder: VideoEncoderConfig | None = None, camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
queue_maxsize: int = 30, queue_maxsize: int = 30,
encoder_threads: int | None = None, encoder_threads: int | None = None,
): ):
@@ -654,6 +657,7 @@ class StreamingVideoEncoder:
""" """
self.fps = fps self.fps = fps
self._camera_encoder = camera_encoder or camera_encoder_defaults() self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._depth_encoder = depth_encoder or depth_encoder_defaults()
self._encoder_threads = encoder_threads self._encoder_threads = encoder_threads
self.queue_maxsize = queue_maxsize self.queue_maxsize = queue_maxsize
@@ -666,11 +670,12 @@ class StreamingVideoEncoder:
self._episode_active = False self._episode_active = False
self._closed = False self._closed = False
def start_episode(self, video_keys: list[str], temp_dir: Path) -> None: def start_episode(self, video_keys: list[str], depth_video_keys: list[str], temp_dir: Path) -> None:
"""Start encoder threads for a new episode. """Start encoder threads for a new episode.
Args: Args:
video_keys: List of video feature keys (e.g. ["observation.images.laptop"]) video_keys: List of video feature keys (e.g. ["observation.images.laptop"])
depth_video_keys: List of video feature keys that carry depth maps (e.g. ["observation.depth.laptop"])
temp_dir: Base directory for temporary MP4 files temp_dir: Base directory for temporary MP4 files
""" """
if self._episode_active: if self._episode_active:
@@ -678,7 +683,7 @@ class StreamingVideoEncoder:
self._dropped_frames.clear() self._dropped_frames.clear()
for video_key in video_keys: for video_key in video_keys + depth_video_keys:
frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize) frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize)
result_queue: queue.Queue = queue.Queue(maxsize=1) result_queue: queue.Queue = queue.Queue(maxsize=1)
stop_event = threading.Event() stop_event = threading.Event()
@@ -686,13 +691,13 @@ class StreamingVideoEncoder:
temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir)) temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"
vcodec = self._camera_encoder.vcodec encoder = self._depth_encoder if video_key in depth_video_keys else self._camera_encoder
codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True) codec_options = encoder.get_codec_options(self._encoder_threads, as_strings=True)
encoder_thread = _CameraEncoderThread( encoder_thread = _CameraEncoderThread(
video_path=video_path, video_path=video_path,
fps=self.fps, fps=self.fps,
vcodec=vcodec, vcodec=encoder.vcodec,
pix_fmt=self._camera_encoder.pix_fmt, pix_fmt=encoder.pix_fmt,
codec_options=codec_options, codec_options=codec_options,
frame_queue=frame_queue, frame_queue=frame_queue,
result_queue=result_queue, result_queue=result_queue,
+7 -7
View File
@@ -53,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict:
# ── Existing encode_video_worker tests ─────────────────────────────── # ── Existing encode_video_worker tests ───────────────────────────────
def test_encode_video_worker_forwards_camera_encoder(tmp_path): def test_encode_video_worker_forwards_video_encoder(tmp_path):
"""_encode_video_worker forwards camera_encoder to encode_video_frames.""" """_encode_video_worker forwards video_encoder to encode_video_frames."""
video_key = "observation.images.laptop" video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent img_dir = tmp_path / Path(fpath).parent
@@ -74,16 +74,16 @@ def test_encode_video_worker_forwards_camera_encoder(tmp_path):
0, 0,
tmp_path, tmp_path,
fps=30, fps=30,
camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None), video_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
encoder_threads=4, encoder_threads=4,
) )
assert captured_kwargs["camera_encoder"].vcodec == "h264" assert captured_kwargs["video_encoder"].vcodec == "h264"
assert captured_kwargs["encoder_threads"] == 4 assert captured_kwargs["encoder_threads"] == 4
def test_encode_video_worker_default_camera_encoder(tmp_path): def test_encode_video_worker_default_video_encoder(tmp_path):
"""_encode_video_worker passes None camera_encoder which encode_video_frames defaults.""" """_encode_video_worker passes None video_encoder which encode_video_frames defaults."""
video_key = "observation.images.laptop" video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent img_dir = tmp_path / Path(fpath).parent
@@ -100,7 +100,7 @@ def test_encode_video_worker_default_camera_encoder(tmp_path):
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
_encode_video_worker(video_key, 0, tmp_path, fps=30) _encode_video_worker(video_key, 0, tmp_path, fps=30)
assert captured_kwargs["camera_encoder"] is None assert captured_kwargs["video_encoder"] is None
assert captured_kwargs["encoder_threads"] is None assert captured_kwargs["encoder_threads"] is None
+1 -1
View File
@@ -338,7 +338,7 @@ def _encode_video(
) -> Path: ) -> Path:
imgs_dir = path.parent / f"imgs_{path.stem}" imgs_dir = path.parent / f"imgs_{path.stem}"
_write_frames(imgs_dir, num_frames=num_frames) _write_frames(imgs_dir, num_frames=num_frames)
encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True) encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True)
return path return path