diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py index 6be63194f..d3c811864 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -31,7 +31,7 @@ import PIL.Image import pyarrow.parquet as pq import torch -from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, DepthEncoderConfig, depth_encoder_defaults from .compute_stats import compute_episode_stats from .dataset_metadata import LeRobotDatasetMetadata @@ -67,7 +67,7 @@ def _encode_video_worker( episode_index: int, root: Path, fps: int, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" @@ -77,7 +77,7 @@ def _encode_video_worker( img_dir, temp_path, fps, - camera_encoder=camera_encoder, + video_encoder=video_encoder, encoder_threads=encoder_threads, overwrite=True, ) @@ -97,6 +97,7 @@ class DatasetWriter: meta: LeRobotDatasetMetadata, root: Path, camera_encoder: VideoEncoderConfig | None, + depth_encoder: DepthEncoderConfig | None, encoder_threads: int | None, batch_encoding_size: int, streaming_encoder: StreamingVideoEncoder | None = None, @@ -110,6 +111,8 @@ class DatasetWriter: root: Local dataset root directory. camera_encoder: Video encoder settings applied to all cameras. ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`. + depth_encoder: Video encoder settings applied to all **depth** cameras. + ``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`. encoder_threads: Number of encoder threads (global). ``None`` lets the codec decide. batch_encoding_size: Number of episodes to accumulate before @@ -121,6 +124,7 @@ class DatasetWriter: self._meta = meta self._root = root self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._depth_encoder = depth_encoder or depth_encoder_defaults() self._encoder_threads = encoder_threads self._batch_encoding_size = batch_encoding_size self._streaming_encoder = streaming_encoder @@ -195,6 +199,7 @@ class DatasetWriter: if frame_index == 0 and self._streaming_encoder is not None: self._streaming_encoder.start_episode( video_keys=list(self._meta.video_keys), + depth_video_keys=set(self._meta.video_keys) & set(self._meta.depth_keys), temp_dir=self._root, ) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 9734bcc74..615be5df7 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -24,7 +24,7 @@ import torch.utils from huggingface_hub import HfApi, snapshot_download from huggingface_hub.errors import RevisionNotFoundError -from lerobot.configs import VideoEncoderConfig +from lerobot.configs import VideoEncoderConfig, DepthEncoderConfig from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata @@ -60,6 +60,7 @@ class LeRobotDataset(torch.utils.data.Dataset): return_uint8: bool = False, batch_encoding_size: int = 1, camera_encoder: VideoEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, encoder_threads: int | None = None, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -186,6 +187,9 @@ class LeRobotDataset(torch.utils.data.Dataset): camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used by the writer. + depth_encoder (DepthEncoderConfig | None, optional): Video encoder settings for depth cameras + (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` + is used by the writer. encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the codec decide. streaming_encoding (bool, optional): If True, encode video frames in real-time during capture @@ -273,6 +277,7 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = self._build_streaming_encoder( self.meta.fps, camera_encoder, + depth_encoder, encoder_queue_maxsize, encoder_threads, ) @@ -280,6 +285,7 @@ class LeRobotDataset(torch.utils.data.Dataset): meta=self.meta, root=self.root, camera_encoder=camera_encoder, + depth_encoder=depth_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -322,12 +328,14 @@ class LeRobotDataset(torch.utils.data.Dataset): def _build_streaming_encoder( fps: int, camera_encoder: VideoEncoderConfig | None, + depth_encoder: DepthEncoderConfig | None, encoder_queue_maxsize: int, encoder_threads: int | None, ) -> StreamingVideoEncoder: return StreamingVideoEncoder( fps=fps, camera_encoder=camera_encoder, + depth_encoder=depth_encoder, queue_maxsize=encoder_queue_maxsize, encoder_threads=encoder_threads, ) @@ -645,6 +653,7 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: str | None = None, batch_encoding_size: int = 1, camera_encoder: VideoEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, metadata_buffer_size: int = 10, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -677,6 +686,8 @@ class LeRobotDataset(torch.utils.data.Dataset): batch-encoding videos. ``1`` means encode immediately. camera_encoder: Video encoder settings for cameras (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` is used. encoder_threads: Number of encoder threads (global). ``None`` lets the codec decide. metadata_buffer_size: Number of episode metadata records to buffer @@ -720,12 +731,13 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = cls._build_streaming_encoder( - fps, camera_encoder, encoder_queue_maxsize, encoder_threads + fps, camera_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, camera_encoder=camera_encoder, + depth_encoder=depth_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index e823a406c..9fd29e138 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -38,7 +38,9 @@ from PIL import Image from lerobot.configs import ( VideoEncoderConfig, + DepthEncoderConfig, camera_encoder_defaults, + depth_encoder_defaults, ) from lerobot.utils.import_utils import get_safe_default_video_backend @@ -335,17 +337,17 @@ def encode_video_frames( imgs_dir: Path | str, video_path: Path | str, fps: int, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, *, log_level: int | None = av.logging.WARNING, overwrite: bool = False, ) -> None: """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" - if camera_encoder is None: - camera_encoder = camera_encoder_defaults() - vcodec = camera_encoder.vcodec - pix_fmt = camera_encoder.pix_fmt + if video_encoder is None: + video_encoder = camera_encoder_defaults() + vcodec = video_encoder.vcodec + pix_fmt = video_encoder.pix_fmt video_path = Path(video_path) imgs_dir = Path(imgs_dir) @@ -367,7 +369,7 @@ def encode_video_frames( with Image.open(input_list[0]) as dummy_image: width, height = dummy_image.size - video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) + video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True) # Set logging level if log_level is not None: @@ -639,6 +641,7 @@ class StreamingVideoEncoder: self, fps: int, camera_encoder: VideoEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, queue_maxsize: int = 30, encoder_threads: int | None = None, ): @@ -654,6 +657,7 @@ class StreamingVideoEncoder: """ self.fps = fps self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._depth_encoder = depth_encoder or depth_encoder_defaults() self._encoder_threads = encoder_threads self.queue_maxsize = queue_maxsize @@ -666,11 +670,12 @@ class StreamingVideoEncoder: self._episode_active = False self._closed = False - def start_episode(self, video_keys: list[str], temp_dir: Path) -> None: + def start_episode(self, video_keys: list[str], depth_video_keys: list[str], temp_dir: Path) -> None: """Start encoder threads for a new episode. Args: video_keys: List of video feature keys (e.g. ["observation.images.laptop"]) + depth_video_keys: List of video feature keys that carry depth maps (e.g. ["observation.depth.laptop"]) temp_dir: Base directory for temporary MP4 files """ if self._episode_active: @@ -678,7 +683,7 @@ class StreamingVideoEncoder: self._dropped_frames.clear() - for video_key in video_keys: + for video_key in video_keys + depth_video_keys: frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize) result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() @@ -686,13 +691,13 @@ class StreamingVideoEncoder: temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir)) video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" - vcodec = self._camera_encoder.vcodec - codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True) + encoder = self._depth_encoder if video_key in depth_video_keys else self._camera_encoder + codec_options = encoder.get_codec_options(self._encoder_threads, as_strings=True) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=self.fps, - vcodec=vcodec, - pix_fmt=self._camera_encoder.pix_fmt, + vcodec=encoder.vcodec, + pix_fmt=encoder.pix_fmt, codec_options=codec_options, frame_queue=frame_queue, result_queue=result_queue, diff --git a/tests/datasets/test_dataset_writer.py b/tests/datasets/test_dataset_writer.py index 8670aeebc..17785ad74 100644 --- a/tests/datasets/test_dataset_writer.py +++ b/tests/datasets/test_dataset_writer.py @@ -53,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict: # ── Existing encode_video_worker tests ─────────────────────────────── -def test_encode_video_worker_forwards_camera_encoder(tmp_path): - """_encode_video_worker forwards camera_encoder to encode_video_frames.""" +def test_encode_video_worker_forwards_video_encoder(tmp_path): + """_encode_video_worker forwards video_encoder to encode_video_frames.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -74,16 +74,16 @@ def test_encode_video_worker_forwards_camera_encoder(tmp_path): 0, tmp_path, fps=30, - camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None), + video_encoder=VideoEncoderConfig(vcodec="h264", preset=None), encoder_threads=4, ) - assert captured_kwargs["camera_encoder"].vcodec == "h264" + assert captured_kwargs["video_encoder"].vcodec == "h264" assert captured_kwargs["encoder_threads"] == 4 -def test_encode_video_worker_default_camera_encoder(tmp_path): - """_encode_video_worker passes None camera_encoder which encode_video_frames defaults.""" +def test_encode_video_worker_default_video_encoder(tmp_path): + """_encode_video_worker passes None video_encoder which encode_video_frames defaults.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -100,7 +100,7 @@ def test_encode_video_worker_default_camera_encoder(tmp_path): with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): _encode_video_worker(video_key, 0, tmp_path, fps=30) - assert captured_kwargs["camera_encoder"] is None + assert captured_kwargs["video_encoder"] is None assert captured_kwargs["encoder_threads"] is None diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py index 224f2405b..7e33aa63f 100644 --- a/tests/datasets/test_video_encoding.py +++ b/tests/datasets/test_video_encoding.py @@ -338,7 +338,7 @@ def _encode_video( ) -> Path: imgs_dir = path.parent / f"imgs_{path.stem}" _write_frames(imgs_dir, num_frames=num_frames) - encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True) + encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True) return path