feat(depth): plumb DepthEncoderConfig through LeRobotDataset and DatasetWriter

This commit is contained in:
CarolinePascal
2026-05-19 22:50:19 +02:00
parent 0cc5162078
commit b4c31f0f67
5 changed files with 47 additions and 25 deletions
+8 -3
View File
@@ -31,7 +31,7 @@ import PIL.Image
import pyarrow.parquet as pq
import torch
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, DepthEncoderConfig, depth_encoder_defaults
from .compute_stats import compute_episode_stats
from .dataset_metadata import LeRobotDatasetMetadata
@@ -67,7 +67,7 @@ def _encode_video_worker(
episode_index: int,
root: Path,
fps: int,
camera_encoder: VideoEncoderConfig | None = None,
video_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
@@ -77,7 +77,7 @@ def _encode_video_worker(
img_dir,
temp_path,
fps,
camera_encoder=camera_encoder,
video_encoder=video_encoder,
encoder_threads=encoder_threads,
overwrite=True,
)
@@ -97,6 +97,7 @@ class DatasetWriter:
meta: LeRobotDatasetMetadata,
root: Path,
camera_encoder: VideoEncoderConfig | None,
depth_encoder: DepthEncoderConfig | None,
encoder_threads: int | None,
batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None,
@@ -110,6 +111,8 @@ class DatasetWriter:
root: Local dataset root directory.
camera_encoder: Video encoder settings applied to all cameras.
``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`.
depth_encoder: Video encoder settings applied to all **depth** cameras.
``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
batch_encoding_size: Number of episodes to accumulate before
@@ -121,6 +124,7 @@ class DatasetWriter:
self._meta = meta
self._root = root
self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._depth_encoder = depth_encoder or depth_encoder_defaults()
self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder
@@ -195,6 +199,7 @@ class DatasetWriter:
if frame_index == 0 and self._streaming_encoder is not None:
self._streaming_encoder.start_episode(
video_keys=list(self._meta.video_keys),
depth_video_keys=set(self._meta.video_keys) & set(self._meta.depth_keys),
temp_dir=self._root,
)
+14 -2
View File
@@ -24,7 +24,7 @@ import torch.utils
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.errors import RevisionNotFoundError
from lerobot.configs import VideoEncoderConfig
from lerobot.configs import VideoEncoderConfig, DepthEncoderConfig
from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE
from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
@@ -60,6 +60,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
return_uint8: bool = False,
batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
encoder_threads: int | None = None,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
@@ -186,6 +187,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras
(codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults`
is used by the writer.
depth_encoder (DepthEncoderConfig | None, optional): Video encoder settings for depth cameras
(codec, quality, etc.). When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults`
is used by the writer.
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
codec decide.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
@@ -273,6 +277,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = self._build_streaming_encoder(
self.meta.fps,
camera_encoder,
depth_encoder,
encoder_queue_maxsize,
encoder_threads,
)
@@ -280,6 +285,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
meta=self.meta,
root=self.root,
camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -322,12 +328,14 @@ class LeRobotDataset(torch.utils.data.Dataset):
def _build_streaming_encoder(
fps: int,
camera_encoder: VideoEncoderConfig | None,
depth_encoder: DepthEncoderConfig | None,
encoder_queue_maxsize: int,
encoder_threads: int | None,
) -> StreamingVideoEncoder:
return StreamingVideoEncoder(
fps=fps,
camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads,
)
@@ -645,6 +653,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
metadata_buffer_size: int = 10,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
@@ -677,6 +686,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
batch-encoding videos. ``1`` means encode immediately.
camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` is used.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
metadata_buffer_size: Number of episode metadata records to buffer
@@ -720,12 +731,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(
fps, camera_encoder, encoder_queue_maxsize, encoder_threads
fps, camera_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads
)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
camera_encoder=camera_encoder,
depth_encoder=depth_encoder,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
+17 -12
View File
@@ -38,7 +38,9 @@ from PIL import Image
from lerobot.configs import (
VideoEncoderConfig,
DepthEncoderConfig,
camera_encoder_defaults,
depth_encoder_defaults,
)
from lerobot.utils.import_utils import get_safe_default_video_backend
@@ -335,17 +337,17 @@ def encode_video_frames(
imgs_dir: Path | str,
video_path: Path | str,
fps: int,
camera_encoder: VideoEncoderConfig | None = None,
video_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
*,
log_level: int | None = av.logging.WARNING,
overwrite: bool = False,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
if camera_encoder is None:
camera_encoder = camera_encoder_defaults()
vcodec = camera_encoder.vcodec
pix_fmt = camera_encoder.pix_fmt
if video_encoder is None:
video_encoder = camera_encoder_defaults()
vcodec = video_encoder.vcodec
pix_fmt = video_encoder.pix_fmt
video_path = Path(video_path)
imgs_dir = Path(imgs_dir)
@@ -367,7 +369,7 @@ def encode_video_frames(
with Image.open(input_list[0]) as dummy_image:
width, height = dummy_image.size
video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True)
video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True)
# Set logging level
if log_level is not None:
@@ -639,6 +641,7 @@ class StreamingVideoEncoder:
self,
fps: int,
camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
queue_maxsize: int = 30,
encoder_threads: int | None = None,
):
@@ -654,6 +657,7 @@ class StreamingVideoEncoder:
"""
self.fps = fps
self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._depth_encoder = depth_encoder or depth_encoder_defaults()
self._encoder_threads = encoder_threads
self.queue_maxsize = queue_maxsize
@@ -666,11 +670,12 @@ class StreamingVideoEncoder:
self._episode_active = False
self._closed = False
def start_episode(self, video_keys: list[str], temp_dir: Path) -> None:
def start_episode(self, video_keys: list[str], depth_video_keys: list[str], temp_dir: Path) -> None:
"""Start encoder threads for a new episode.
Args:
video_keys: List of video feature keys (e.g. ["observation.images.laptop"])
depth_video_keys: List of video feature keys that carry depth maps (e.g. ["observation.depth.laptop"])
temp_dir: Base directory for temporary MP4 files
"""
if self._episode_active:
@@ -678,7 +683,7 @@ class StreamingVideoEncoder:
self._dropped_frames.clear()
for video_key in video_keys:
for video_key in video_keys + depth_video_keys:
frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize)
result_queue: queue.Queue = queue.Queue(maxsize=1)
stop_event = threading.Event()
@@ -686,13 +691,13 @@ class StreamingVideoEncoder:
temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"
vcodec = self._camera_encoder.vcodec
codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True)
encoder = self._depth_encoder if video_key in depth_video_keys else self._camera_encoder
codec_options = encoder.get_codec_options(self._encoder_threads, as_strings=True)
encoder_thread = _CameraEncoderThread(
video_path=video_path,
fps=self.fps,
vcodec=vcodec,
pix_fmt=self._camera_encoder.pix_fmt,
vcodec=encoder.vcodec,
pix_fmt=encoder.pix_fmt,
codec_options=codec_options,
frame_queue=frame_queue,
result_queue=result_queue,
+7 -7
View File
@@ -53,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict:
# ── Existing encode_video_worker tests ───────────────────────────────
def test_encode_video_worker_forwards_camera_encoder(tmp_path):
"""_encode_video_worker forwards camera_encoder to encode_video_frames."""
def test_encode_video_worker_forwards_video_encoder(tmp_path):
"""_encode_video_worker forwards video_encoder to encode_video_frames."""
video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent
@@ -74,16 +74,16 @@ def test_encode_video_worker_forwards_camera_encoder(tmp_path):
0,
tmp_path,
fps=30,
camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
video_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
encoder_threads=4,
)
assert captured_kwargs["camera_encoder"].vcodec == "h264"
assert captured_kwargs["video_encoder"].vcodec == "h264"
assert captured_kwargs["encoder_threads"] == 4
def test_encode_video_worker_default_camera_encoder(tmp_path):
"""_encode_video_worker passes None camera_encoder which encode_video_frames defaults."""
def test_encode_video_worker_default_video_encoder(tmp_path):
"""_encode_video_worker passes None video_encoder which encode_video_frames defaults."""
video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent
@@ -100,7 +100,7 @@ def test_encode_video_worker_default_camera_encoder(tmp_path):
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
_encode_video_worker(video_key, 0, tmp_path, fps=30)
assert captured_kwargs["camera_encoder"] is None
assert captured_kwargs["video_encoder"] is None
assert captured_kwargs["encoder_threads"] is None
+1 -1
View File
@@ -338,7 +338,7 @@ def _encode_video(
) -> Path:
imgs_dir = path.parent / f"imgs_{path.stem}"
_write_frames(imgs_dir, num_frames=num_frames)
encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True)
encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True)
return path