fix(imports): fixing av import in test_depth.py

tests(typos): fixing typos in tests
fix(info): fixing info metadata update when is_depth_map was set
2026-05-24 13:09:43 +00:00 · 2026-05-22 15:13:15 +02:00 · 2026-05-22 13:09:56 +02:00 · 2026-05-22 02:48:30 +02:00 · 2026-05-22 02:07:33 +02:00 · 2026-05-22 02:06:37 +02:00
32 changed files with 1100 additions and 140 deletions
@@ -82,7 +82,7 @@ After the first episode of a video stream is encoded, the encoder configuration
        "video.pix_fmt": "yuv420p",
        "video.fps": 30,
        "video.channels": 3,
-        "video.is_depth_map": false,
+        "is_depth_map": false,
        "video.g": 2,
        "video.crf": 30,
        "video.preset": "fast",
@@ -97,7 +97,7 @@ After the first episode of a video stream is encoded, the encoder configuration

 Two sources contribute to the `info` block:

- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present.
+- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `is_depth_map`, plus `audio.*` if an audio stream is present.
 - **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`.

 <Tip>
@@ -430,7 +430,7 @@ class OpenCVCamera(Camera):
        Internal loop run by the background thread for asynchronous reading.

        On each iteration:
-        1. Reads a color frame
+        1. Reads a color frame (blocking call)
        2. Stores result in latest_frame and updates timestamp (thread-safe)
        3. Sets new_frame_event to notify listeners

@@ -439,8 +439,9 @@ class OpenCVCamera(Camera):
        if self.stop_event is None:
            raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.")

+        stop_event = self.stop_event
        failure_count = 0
-        while not self.stop_event.is_set():
+        while not stop_event.is_set():
            try:
                raw_frame = self._read_from_hardware()
                processed_frame = self._postprocess_image(raw_frame)
@@ -478,6 +479,8 @@ class OpenCVCamera(Camera):

        if self.thread is not None and self.thread.is_alive():
            self.thread.join(timeout=2.0)
+            if self.thread.is_alive():
+                logger.warning(f"{self} read thread did not terminate within timeout.")

        self.thread = None
        self.stop_event = None
@@ -332,8 +332,8 @@ class RealSenseCamera(Camera):
        from the camera hardware via the RealSense pipeline.

        Returns:
-            np.ndarray: The depth map as a NumPy array (height, width)
-                  of type `np.uint16` (raw depth values in millimeters) and rotation.
+            np.ndarray: The depth map as a NumPy array (height, width, 1)
+                  of type `np.uint16` (raw depth values in millimeters).

        Raises:
            DeviceNotConnectedError: If the camera is not connected.
@@ -465,8 +465,8 @@ class RealSenseCamera(Camera):
        Internal loop run by the background thread for asynchronous reading.

        On each iteration:
-        1. Reads a color frame with 500ms timeout
-        2. Stores result in latest_frame and updates timestamp (thread-safe)
+        1. Reads a color/depth frame (blocking call with 10s timeout)
+        2. Stores result in latest_color_frame/latest_depth_frame and updates timestamp (thread-safe)
        3. Sets new_frame_event to notify listeners

        Stops on DeviceNotConnectedError, logs other errors and continues.
@@ -474,8 +474,9 @@ class RealSenseCamera(Camera):
        if self.stop_event is None:
            raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.")

+        stop_event = self.stop_event
        failure_count = 0
-        while not self.stop_event.is_set():
+        while not stop_event.is_set():
            try:
                frame = self._read_from_hardware()
                color_frame_raw = frame.get_color_frame()
@@ -486,6 +487,8 @@ class RealSenseCamera(Camera):
                    depth_frame_raw = frame.get_depth_frame()
                    depth_frame = np.asanyarray(depth_frame_raw.get_data())
                    processed_depth_frame = self._postprocess_image(depth_frame, depth_frame=True)
+                    if processed_depth_frame.ndim == 2:  # (H, W) -> (H, W, 1)
+                        processed_depth_frame = processed_depth_frame[..., np.newaxis]

                capture_time = time.perf_counter()

@@ -522,6 +525,8 @@ class RealSenseCamera(Camera):

        if self.thread is not None and self.thread.is_alive():
            self.thread.join(timeout=2.0)
+            if self.thread.is_alive():  # pragma: no cover
+                logger.warning(f"{self} read thread did not terminate within timeout.")

        self.thread = None
        self.stop_event = None
@@ -532,7 +537,6 @@ class RealSenseCamera(Camera):
            self.latest_timestamp = None
            self.new_frame_event.clear()

-    # NOTE(Steven): Missing implementation for depth for now
    @check_if_not_connected
    def async_read(self, timeout_ms: float = 200) -> NDArray[Any]:
        """
@@ -575,7 +579,6 @@ class RealSenseCamera(Camera):

        return frame

-    # NOTE(Steven): Missing implementation for depth for now
    @check_if_not_connected
    def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]:
        """Return the most recent (color) frame captured immediately (Peeking).
@@ -611,6 +614,71 @@ class RealSenseCamera(Camera):

        return frame

+    @check_if_not_connected
+    def async_read_depth(self, timeout_ms: float = 200) -> NDArray[Any]:
+        """Read the latest depth frame asynchronously, in metric meters.
+
+        Mirrors :meth:`async_read` but returns the depth stream rather than the
+        color stream. Output is ``np.uint16`` of shape ``(H, W, 1)``.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            RuntimeError: If ``use_depth`` is ``False`` for this camera, or if
+                the background read thread is not running.
+            TimeoutError: If no frame becomes available within ``timeout_ms``.
+        """
+        if not self.use_depth:
+            raise RuntimeError(f"{self}: cannot read depth — camera was configured with use_depth=False.")
+
+        if self.thread is None or not self.thread.is_alive():
+            raise RuntimeError(f"{self} read thread is not running.")
+
+        if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0):
+            raise TimeoutError(f"Timed out waiting for depth frame from camera {self} after {timeout_ms} ms.")
+
+        with self.frame_lock:
+            depth_frame = self.latest_depth_frame
+            self.new_frame_event.clear()
+
+        if depth_frame is None:
+            raise RuntimeError(f"Internal error: Event set but no depth frame available for {self}.")
+
+        return depth_frame
+
+    @check_if_not_connected
+    def read_latest_depth(self, max_age_ms: int = 500) -> NDArray[Any]:
+        """Return the most recent depth frame in metric meters (peeking).
+
+        Non-blocking counterpart of :meth:`read_latest` for the depth stream.
+        Output is ``np.uint16`` of shape ``(H, W, 1)`` in millimeters.
+
+        Raises:
+            DeviceNotConnectedError: If the camera is not connected.
+            RuntimeError: If ``use_depth`` is ``False`` for this camera, or if
+                no depth frame has been captured yet.
+            TimeoutError: If the latest depth frame is older than ``max_age_ms``.
+        """
+        if not self.use_depth:
+            raise RuntimeError(f"{self}: cannot read depth — camera was configured with use_depth=False.")
+
+        if self.thread is None or not self.thread.is_alive():
+            raise RuntimeError(f"{self} read thread is not running.")
+
+        with self.frame_lock:
+            depth_frame = self.latest_depth_frame
+            timestamp = self.latest_timestamp
+
+        if depth_frame is None or timestamp is None:
+            raise RuntimeError(f"{self} has not captured any depth frames yet.")
+
+        age_ms = (time.perf_counter() - timestamp) * 1e3
+        if age_ms > max_age_ms:
+            raise TimeoutError(
+                f"{self} latest depth frame is too old: {age_ms:.1f} ms (max allowed: {max_age_ms} ms)."
+            )
+
+        return depth_frame
+
    def disconnect(self) -> None:
        """
        Disconnects from the camera, stops the pipeline, and cleans up resources.
@@ -249,8 +249,9 @@ class ZMQCamera(Camera):
        if self.stop_event is None:
            raise RuntimeError(f"{self}: stop_event is not initialized.")

+        stop_event = self.stop_event
        failure_count = 0
-        while not self.stop_event.is_set():
+        while not stop_event.is_set():
            try:
                frame = self._read_from_hardware()
                capture_time = time.perf_counter()
@@ -292,6 +293,8 @@ class ZMQCamera(Camera):

        if self.thread is not None and self.thread.is_alive():
            self.thread.join(timeout=2.0)
+            if self.thread.is_alive():
+                logger.warning(f"{self} read thread did not terminate within timeout.")

        self.thread = None
        self.stop_event = None
@@ -34,8 +34,10 @@ from .types import (
 from .video import (
    VALID_VIDEO_CODECS,
    VIDEO_ENCODER_INFO_KEYS,
+    DepthEncoderConfig,
    VideoEncoderConfig,
    camera_encoder_defaults,
+    depth_encoder_defaults,
 )

 __all__ = [
@@ -53,8 +55,10 @@ __all__ = [
    "PreTrainedConfig",
    "WandBConfig",
    "VideoEncoderConfig",
+    "DepthEncoderConfig",
    # Defaults
    "camera_encoder_defaults",
+    "depth_encoder_defaults",
    # Constants
    "VALID_VIDEO_CODECS",
    "VIDEO_ENCODER_INFO_KEYS",
@@ -18,7 +18,7 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path

-from .video import VideoEncoderConfig, camera_encoder_defaults
+from .video import DepthEncoderConfig, VideoEncoderConfig, camera_encoder_defaults, depth_encoder_defaults


@dataclass
@@ -60,6 +60,8 @@ class DatasetRecordConfig:
    # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
    # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
    camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
+    # Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys.
+    depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
    # Enable streaming video encoding: encode frames in real-time during capture instead
    # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
    streaming_encoding: bool = False
@@ -19,8 +19,8 @@
 from __future__ import annotations

 import logging
-from dataclasses import dataclass, field
-from typing import Any
+from dataclasses import dataclass, field, fields
+from typing import Any, ClassVar

 from lerobot.utils.import_utils import require_package

@@ -36,11 +36,12 @@ HW_VIDEO_CODECS = [
    "h264_vaapi",  # Linux Intel/AMD
    "h264_qsv",  # Intel Quick Sync
 ]
-VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
+VALID_VIDEO_CODECS: frozenset[str] = frozenset(
+    {"h264", "hevc", "libsvtav1", "ffv1", "auto", *HW_VIDEO_CODECS}
+)
 # Aliases for legacy video codec names.
 VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}

-
 LIBSVTAV1_DEFAULT_PRESET: int = 12

 # Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`).
@@ -52,6 +53,19 @@ VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset(
    f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES
 )

+# Default depth quantization and encoding parameters.
+DEPTH_QUANT_BITS: int = 12
+DEPTH_QMAX: int = (1 << DEPTH_QUANT_BITS) - 1  # 4095
+
+DEFAULT_DEPTH_MIN: float = 0.01
+DEFAULT_DEPTH_MAX: float = 10.0
+DEFAULT_DEPTH_SHIFT: float = 3.5
+DEFAULT_DEPTH_USE_LOG: bool = True
+DEFAULT_DEPTH_PIX_FMT: str = "gray12le"
+
+# Depth-specific tuning fields persisted under ``features[*]["info"]`` as ``video.<name>``.
+DEPTH_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset({"depth_min", "depth_max", "shift", "use_log"})
+

@dataclass
 class VideoEncoderConfig:
@@ -86,6 +100,10 @@ class VideoEncoderConfig:
    video_backend: str = "pyav"
    extra_options: dict[str, Any] = field(default_factory=dict)

+    # Source-data channel count this encoder is expected to handle (3 for RGB,
+    # 1 for depth, etc.)
+    _DEFAULT_CHANNELS: ClassVar[int] = 3
+
    def __post_init__(self) -> None:
        self.resolve_vcodec()
        # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work".
@@ -138,7 +156,9 @@ class VideoEncoderConfig:
            require_package("av", extra="dataset")
            from lerobot.datasets import check_video_encoder_parameters_pyav

-            check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options())
+            check_video_encoder_parameters_pyav(
+                self.vcodec, self.pix_fmt, self.get_codec_options(), channels=self._DEFAULT_CHANNELS
+            )

    def resolve_vcodec(self) -> None:
        """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder.
@@ -218,6 +238,10 @@ class VideoEncoderConfig:
        elif self.vcodec == "h264_qsv":
            set_if("global_quality", self.crf)
            set_if("preset", self.preset)
+        elif self.vcodec == "ffv1":
+            # Lossless intra-frame codec. ``crf``/``preset``/``fast_decode``
+            # are not meaningful.
+            set_if("threads", encoder_threads)
        else:
            set_if("crf", self.crf)
            set_if("preset", self.preset)
@@ -233,3 +257,59 @@ class VideoEncoderConfig:
 def camera_encoder_defaults() -> VideoEncoderConfig:
    """Return a :class:`VideoEncoderConfig` with RGB-camera defaults."""
    return VideoEncoderConfig()
+
+
+@dataclass
+class DepthEncoderConfig(VideoEncoderConfig):
+    """Encoder configuration for depth-map streams.
+
+    Inherits the full :class:`VideoEncoderConfig` surface (codec, GOP, CRF,
+    preset, ``extra_options``…) and adds the four parameters of the depth
+    quantizer.
+
+    Defaults flip ``vcodec`` to ``"hevc"`` (Main 12 profile) and ``pix_fmt``
+    to ``"gray12le"``.
+
+
+    Attributes:
+        depth_min: Minimum depth in physical units (e.g. metres) represented
+            by quantum ``0``.
+        depth_max: Maximum depth represented by quantum :data:`DEPTH_QMAX`.
+        shift: Pre-log offset for numerical stability near zero.
+        use_log: ``True`` for logarithmic quantization (default; matches
+            sensor error profile), ``False`` for linear.
+    """
+
+    vcodec: str = "hevc"
+    pix_fmt: str = "gray12le"
+
+    depth_min: float = DEFAULT_DEPTH_MIN
+    depth_max: float = DEFAULT_DEPTH_MAX
+    shift: float = DEFAULT_DEPTH_SHIFT
+    use_log: bool = DEFAULT_DEPTH_USE_LOG
+
+    _DEFAULT_CHANNELS: ClassVar[int] = 1
+
+    @classmethod
+    def from_video_info(cls, video_info: dict | None) -> DepthEncoderConfig:
+        """Reconstruct a :class:`DepthEncoderConfig` from a depth feature's ``info`` block.
+
+        Reuses :meth:`VideoEncoderConfig.from_video_info` for the base
+        codec/tuning fields and then layers the depth-specific tuning
+        (``depth_min`` / ``depth_max`` / ``shift`` / ``use_log``) on top.
+        Missing keys fall back to the class defaults.
+        """
+        base = VideoEncoderConfig.from_video_info(video_info)
+        kwargs: dict[str, Any] = {f.name: getattr(base, f.name) for f in fields(base) if f.init}
+
+        video_info = video_info or {}
+        for name in DEPTH_ENCODER_INFO_FIELD_NAMES:
+            value = video_info.get(f"video.{name}")
+            if value is not None:
+                kwargs[name] = value
+        return cls(**kwargs)
+
+
+def depth_encoder_defaults() -> DepthEncoderConfig:
+    """Return a :class:`DepthEncoderConfig` with depth-camera defaults."""
+    return DepthEncoderConfig()
@@ -550,8 +550,10 @@ def _validate_stat_value(value: np.ndarray, key: str, feature_key: str) -> None:
    if key == "count" and value.shape != (1,):
        raise ValueError(f"Shape of 'count' must be (1), but is {value.shape} instead.")

-    if "image" in feature_key and key != "count" and value.shape != (3, 1, 1):
-        raise ValueError(f"Shape of quantile '{key}' must be (3,1,1), but is {value.shape} instead.")
+    if "image" in feature_key and key != "count" and value.shape not in ((3, 1, 1), (1, 1, 1)):
+        raise ValueError(
+            f"Shape of quantile '{key}' must be (3,1,1) or (1,1,1) but is {value.shape} instead."
+        )


 def _assert_type_and_shape(stats_list: list[dict[str, dict]]):
@@ -338,6 +338,25 @@ class LeRobotDatasetMetadata:
        """Keys to access visual modalities stored as videos."""
        return [key for key, ft in self.features.items() if ft["dtype"] == "video"]

+    @property
+    def depth_keys(self) -> list[str]:
+        """Keys to access depth-map modalities stored as videos or images.
+
+        A depth key is a feature whose ``info`` dict carries ``"is_depth_map": True``
+        (or the legacy ``"video.is_depth_map"`` inside ``info`` or ``video_info``).
+        """
+
+        def _is_depth(ft: dict) -> bool:
+            info = ft.get("info") or {}
+            video_info = ft.get("video_info") or {}
+            return (
+                info.get("is_depth_map", False)
+                or info.get("video.is_depth_map", False)
+                or video_info.get("video.is_depth_map", False)
+            )
+
+        return [key for key, ft in self.features.items() if _is_depth(ft)]
+
    @property
    def camera_keys(self) -> list[str]:
        """Keys to access visual modalities (regardless of their storage method)."""
@@ -538,7 +557,7 @@ class LeRobotDatasetMetadata:
    def update_video_info(
        self,
        video_key: str | None = None,
-        camera_encoder: VideoEncoderConfig | None = None,
+        video_encoder: VideoEncoderConfig | None = None,
    ) -> None:
        """Populate per-feature video info in ``info.json``.

@@ -558,9 +577,13 @@ class LeRobotDatasetMetadata:

        video_keys = [video_key] if video_key is not None else self.video_keys
        for key in video_keys:
-            if not self.features[key].get("info", None):
-                video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
-                self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder)
+            existing = self.features[key].get("info") or {}
+            # Skip only if real video info has already been written. The ``is_depth_map`` entry (created at feature creation) is not blocking.
+            if set(existing.keys()) - {"is_depth_map"}:
+                continue
+            video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
+            new_info = get_video_info(video_path, video_encoder=video_encoder)
+            self.info.features[key]["info"] = {**existing, **new_info}

    def update_chunk_settings(
        self,
@@ -22,7 +22,10 @@ from pathlib import Path
 import datasets
 import torch

+from lerobot.configs.video import DepthEncoderConfig
+
 from .dataset_metadata import LeRobotDatasetMetadata
+from .depth_utils import dequantize_depth
 from .feature_utils import (
    check_delta_timestamps,
    get_delta_indices,
@@ -86,6 +89,12 @@ class DatasetReader:
            check_delta_timestamps(delta_timestamps, meta.fps, tolerance_s)
            self.delta_indices = get_delta_indices(delta_timestamps, meta.fps)

+        ##TODO(CarolinePascal): Should we rather use a more lightweight structure ?
+        self._depth_encoder_configs: dict[str, DepthEncoderConfig] = {
+            vid_key: DepthEncoderConfig.from_video_info(self._meta.features[vid_key].get("info"))
+            for vid_key in self._meta.depth_keys
+        }
+
    def try_load(self) -> bool:
        """Attempt to load from local cache. Returns True if data is sufficient."""
        try:
@@ -247,7 +256,18 @@ class DatasetReader:
                self._tolerance_s,
                self._video_backend,
                return_uint8=self._return_uint8,
+                is_depth=vid_key in self._meta.depth_keys,
            )
+            if vid_key in self._meta.depth_keys:
+                depth_encoder = self._depth_encoder_configs[vid_key]
+                frames = dequantize_depth(
+                    frames,
+                    depth_min=depth_encoder.depth_min,
+                    depth_max=depth_encoder.depth_max,
+                    shift=depth_encoder.shift,
+                    use_log=depth_encoder.use_log,
+                    output_tensor=True,
+                )
            return vid_key, frames.squeeze(0)

        items = list(query_timestamps.items())
@@ -1329,7 +1329,7 @@ def _estimate_frame_size_via_calibration(
            imgs_dir=calibration_dir,
            video_path=calibration_video_path,
            fps=fps,
-            camera_encoder=camera_encoder,
+            video_encoder=camera_encoder,
            overwrite=True,
        )

@@ -1813,7 +1813,7 @@ def convert_image_to_video_dataset(
                    imgs_dir=imgs_dir,
                    video_path=video_path,
                    fps=fps,
-                    camera_encoder=camera_encoder,
+                    video_encoder=camera_encoder,
                    overwrite=True,
                )

@@ -1860,7 +1860,7 @@ def convert_image_to_video_dataset(
                    video_key=img_key, chunk_index=0, file_index=0
                )
                new_meta.info.features[img_key]["info"] = get_video_info(
-                    video_path, camera_encoder=camera_encoder
+                    video_path, video_encoder=camera_encoder
                )

        write_info(new_meta.info, new_meta.root)
@@ -31,7 +31,12 @@ import PIL.Image
 import pyarrow.parquet as pq
 import torch

-from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
+from lerobot.configs import (
+    DepthEncoderConfig,
+    VideoEncoderConfig,
+    camera_encoder_defaults,
+    depth_encoder_defaults,
+)

 from .compute_stats import compute_episode_stats
 from .dataset_metadata import LeRobotDatasetMetadata
@@ -48,6 +53,7 @@ from .io_utils import (
    write_info,
 )
 from .utils import (
+    DEFAULT_DEPTH_PATH,
    DEFAULT_EPISODES_PATH,
    DEFAULT_IMAGE_PATH,
    update_chunk_file_indices,
@@ -67,17 +73,22 @@ def _encode_video_worker(
    episode_index: int,
    root: Path,
    fps: int,
-    camera_encoder: VideoEncoderConfig | None = None,
+    video_encoder: VideoEncoderConfig | None = None,
    encoder_threads: int | None = None,
 ) -> Path:
    temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
-    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
+    path_template = (
+        DEFAULT_DEPTH_PATH
+        if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig)
+        else DEFAULT_IMAGE_PATH
+    )
+    fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0)
    img_dir = (root / fpath).parent
    encode_video_frames(
        img_dir,
        temp_path,
        fps,
-        camera_encoder=camera_encoder,
+        video_encoder=video_encoder,
        encoder_threads=encoder_threads,
        overwrite=True,
    )
@@ -97,6 +108,7 @@ class DatasetWriter:
        meta: LeRobotDatasetMetadata,
        root: Path,
        camera_encoder: VideoEncoderConfig | None,
+        depth_encoder: DepthEncoderConfig | None,
        encoder_threads: int | None,
        batch_encoding_size: int,
        streaming_encoder: StreamingVideoEncoder | None = None,
@@ -110,6 +122,8 @@ class DatasetWriter:
            root: Local dataset root directory.
            camera_encoder: Video encoder settings applied to all cameras.
                ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`.
+            depth_encoder: Video encoder settings applied to all **depth** cameras.
+                ``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`.
            encoder_threads: Number of encoder threads (global). ``None``
                lets the codec decide.
            batch_encoding_size: Number of episodes to accumulate before
@@ -121,6 +135,7 @@ class DatasetWriter:
        self._meta = meta
        self._root = root
        self._camera_encoder = camera_encoder or camera_encoder_defaults()
+        self._depth_encoder = depth_encoder or depth_encoder_defaults()
        self._encoder_threads = encoder_threads
        self._batch_encoding_size = batch_encoding_size
        self._streaming_encoder = streaming_encoder
@@ -145,7 +160,8 @@ class DatasetWriter:
        return ep_buffer

    def _get_image_file_path(self, episode_index: int, image_key: str, frame_index: int) -> Path:
-        fpath = DEFAULT_IMAGE_PATH.format(
+        path_template = DEFAULT_DEPTH_PATH if image_key in self._meta.depth_keys else DEFAULT_IMAGE_PATH
+        fpath = path_template.format(
            image_key=image_key, episode_index=episode_index, frame_index=frame_index
        )
        return self._root / fpath
@@ -195,6 +211,7 @@ class DatasetWriter:
        if frame_index == 0 and self._streaming_encoder is not None:
            self._streaming_encoder.start_episode(
                video_keys=list(self._meta.video_keys),
+                depth_video_keys=set(self._meta.video_keys) & set(self._meta.depth_keys),
                temp_dir=self._root,
            )

@@ -293,7 +310,9 @@ class DatasetWriter:
                            episode_index,
                            self._root,
                            self._meta.fps,
-                            self._camera_encoder,
+                            self._depth_encoder
+                            if video_key in self._meta.depth_keys
+                            else self._camera_encoder,
                            self._encoder_threads,
                        ): video_key
                        for video_key in self._meta.video_keys
@@ -504,7 +523,12 @@ class DatasetWriter:

        # Update video info (only needed when first episode is encoded)
        if episode_index == 0:
-            self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder)
+            self._meta.update_video_info(
+                video_key,
+                video_encoder=self._depth_encoder
+                if video_key in self._meta.depth_keys
+                else self._camera_encoder,
+            )
            write_info(self._meta.info, self._meta.root)

        metadata = {
@@ -571,13 +595,14 @@ class DatasetWriter:
            self.image_writer.wait_until_done()

    def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
-        """Use ffmpeg to convert frames stored as png into mp4 videos."""
+        """Use ffmpeg to convert frames stored as png/tiff into mp4 videos."""
+        is_depth = video_key in self._meta.depth_keys
        return _encode_video_worker(
            video_key,
            episode_index,
            self._root,
            self._meta.fps,
-            self._camera_encoder,
+            self._depth_encoder if is_depth else self._camera_encoder,
            self._encoder_threads,
        )

@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Depth encoding/decoding helpers for :class:`VideoEncoderConfig`.
+"""
+
+import math
+from typing import Literal
+
+import av
+import numpy as np
+import torch
+from numpy.typing import NDArray
+
+from lerobot.configs.video import (
+    DEFAULT_DEPTH_MAX,
+    DEFAULT_DEPTH_MIN,
+    DEFAULT_DEPTH_PIX_FMT,
+    DEFAULT_DEPTH_SHIFT,
+    DEFAULT_DEPTH_USE_LOG,
+    DEPTH_QMAX,
+)
+
+from .pyav_utils import write_u16_plane
+
+_MM_PER_METRE = 1000.0
+_UINT16_MAX = 65535
+
+
+def _validate_log_quant_params(depth_min: float, shift: float) -> None:
+    """Ensure ``log(depth_min + shift)`` is finite."""
+    if depth_min + shift <= 0:
+        raise ValueError(
+            f"depth_min + shift must be positive for logarithmic quantization, "
+            f"got depth_min={depth_min} + shift={shift} = {depth_min + shift}"
+        )
+
+
+def _depth_input_to_float32_and_unit(
+    depth: NDArray[np.integer] | NDArray[np.floating],
+    input_unit: Literal["auto", "m", "mm"],
+) -> tuple[NDArray[np.float32], Literal["m", "mm"]]:
+    """Convert depth to float32 in the chosen unit, and return the resolved unit."""
+    resolved_unit = (
+        ("m" if np.issubdtype(depth.dtype, np.floating) else "mm") if input_unit == "auto" else input_unit
+    )
+    return depth.astype(np.float32, order="K"), resolved_unit
+
+
+def quantize_depth(
+    depth: NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor,
+    depth_min: float = DEFAULT_DEPTH_MIN,
+    depth_max: float = DEFAULT_DEPTH_MAX,
+    shift: float = DEFAULT_DEPTH_SHIFT,
+    use_log: bool = DEFAULT_DEPTH_USE_LOG,
+    pix_fmt: str = DEFAULT_DEPTH_PIX_FMT,
+    video_backend: str | None = "pyav",
+    input_unit: Literal["auto", "m", "mm"] = "auto",
+) -> NDArray[np.uint16] | av.VideoFrame:
+    """Quantize depth to 12-bit codes (``uint16``, values ``0…DEPTH_QMAX``).
+
+    Depth maps are packed into 12-bit integer frames so they fit in standard
+    high-bit-depth pixel formats (e.g. ``yuv420p12le`` / ``gray12le``)
+    and can be encoded by widely supported video codecs (HEVC Main 12, ffv1).
+    Logarithmic quantization is the default because it allocates more quanta
+    to near-range depth, which matches the (1/depth) error profile of typical
+    depth sensors. Math is ported from BEHAVIOR-1K's ``obs_utils.py``.
+
+    **Input units**:
+
+    - ``input_unit="auto"`` (default): infer from dtype (floating = m, non-floating = mm).
+    - ``input_unit="mm"``: interpret input values as millimetres.
+    - ``input_unit="m"``: interpret input values as metres.
+
+    Quantization math runs in the **resolved input unit**.
+
+    ``depth_min``, ``depth_max``, and ``shift`` are always in **metres**.
+
+    Args:
+        depth: Depth map; ``torch.Tensor`` is moved to CPU for conversion.
+        depth_min: Depth (metres) at quantum ``0``.
+        depth_max: Depth (metres) at quantum :data:`DEPTH_QMAX`.
+        shift: Depth shift (metres); used in log mode. Must satisfy ``depth_min + shift > 0``.
+        use_log: If ``True`` (default), quantize in log space.
+        video_backend: Video backend to use for encoding. Defaults to "pyav".
+        input_unit: Input unit policy (``"auto"``, ``"mm"``, ``"m"``).
+
+    Returns:
+        ``numpy.ndarray``, ``dtype=uint16``, same shape as ``depth``, values in
+        ``[0, DEPTH_QMAX]``.
+
+    Raises:
+        ValueError: If ``input_unit`` is not ``"auto"``, ``"mm"``, or ``"m"``.
+        ValueError: If ``use_log=True`` and ``depth_min + shift <= 0``.
+    """
+    if input_unit not in ("auto", "m", "mm"):
+        raise ValueError(f"input_unit must be 'auto', 'm', or 'mm', got {input_unit!r}")
+
+    if isinstance(depth, torch.Tensor):
+        depth = depth.detach().cpu().numpy()
+
+    # Squeeze single-channel dim: (H, W, 1) or (1, H, W) → (H, W)
+    if depth.ndim == 3 and (depth.shape[-1] == 1 or depth.shape[0] == 1):
+        depth = depth.squeeze()
+
+    depth_f, resolved_unit = _depth_input_to_float32_and_unit(depth, input_unit=input_unit)
+
+    # Convert depth_min, depth_max, and shift to the resolved input unit.
+    depth_min_u = np.float32(depth_min) if resolved_unit == "m" else np.float32(depth_min * _MM_PER_METRE)
+    depth_max_u = np.float32(depth_max) if resolved_unit == "m" else np.float32(depth_max * _MM_PER_METRE)
+    shift_u = np.float32(shift) if resolved_unit == "m" else np.float32(shift * _MM_PER_METRE)
+
+    # Normalization and quantization is performed in the resolved input unit.
+    if use_log:
+        _validate_log_quant_params(depth_min, shift)
+        log_min = math.log(float(depth_min_u + shift_u))
+        log_max = math.log(float(depth_max_u + shift_u))
+        norm = (np.log(depth_f + shift_u) - log_min) / (log_max - log_min)
+    else:
+        norm = (depth_f - depth_min_u) / (depth_max_u - depth_min_u)
+
+    quantized = np.rint(norm * DEPTH_QMAX).clip(0, DEPTH_QMAX).astype(np.uint16, copy=False)
+
+    if video_backend == "pyav":
+        frame = av.VideoFrame.from_ndarray(quantized, format=pix_fmt)
+        write_u16_plane(frame.planes[0], quantized)
+        return frame
+    else:
+        return quantized
+
+
+def dequantize_depth(
+    quantized: NDArray[np.uint16] | av.VideoFrame,
+    depth_min: float = DEFAULT_DEPTH_MIN,
+    depth_max: float = DEFAULT_DEPTH_MAX,
+    shift: float = DEFAULT_DEPTH_SHIFT,
+    use_log: bool = DEFAULT_DEPTH_USE_LOG,
+    pix_fmt: str = DEFAULT_DEPTH_PIX_FMT,
+    output_unit: Literal["m", "mm"] = "mm",
+    output_tensor: bool = False,
+) -> NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor:
+    """Inverse of :func:`quantize_depth`.
+
+    Tuning arguments **must match** :func:`quantize_depth`.
+
+    Decoding inverts the same normalized code mapping as :func:`quantize_depth`
+    using ``depth_min`` / ``depth_max`` / ``shift`` (in metres), then returns
+    the requested output unit.
+
+    Args:
+        quantized: 12-bit codes ``[0, DEPTH_QMAX]``, ``dtype=uint16``.
+        depth_min, depth_max, shift, use_log: Same as :func:`quantize_depth` (metres).
+        output_unit: ``\"mm\"`` returns ``uint16`` millimetres (``rint``, clip
+            ``[0, 65535]``). ``\"m\"`` returns ``float32`` metres in
+            ``[depth_min, depth_max]``.
+        output_tensor: If True, return a torch.Tensor instead of a numpy array.
+
+    Returns:
+        Depth map in the requested unit and dtype.
+
+    Raises:
+        ValueError: If ``use_log=True`` and ``depth_min + shift <= 0``.
+        ValueError: If ``output_unit`` is not ``\"m\"`` or ``\"mm\"``.
+    """
+    if output_unit not in ("m", "mm"):
+        raise ValueError(f"output_unit must be 'm' or 'mm', got {output_unit!r}")
+
+    if isinstance(quantized, av.VideoFrame):
+        quantized = quantized.to_ndarray(format=pix_fmt)
+
+    norm = np.asarray(quantized, dtype=np.float32, order="K") / DEPTH_QMAX
+
+    depth_min_m = np.float32(depth_min)
+    depth_max_m = np.float32(depth_max)
+    shift_m = np.float32(shift)
+
+    # The de-normalization and de-quantization is performed in meters (convenience choice).
+    if use_log:
+        _validate_log_quant_params(depth_min, shift)
+        log_min = math.log(float(depth_min_m + shift_m))
+        log_max = math.log(float(depth_max_m + shift_m))
+        depth_m = np.exp(norm * (log_max - log_min) + log_min) - shift_m
+    else:
+        depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m
+    depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False)
+
+    # Add single-channel dim: (H, W) → (H, W, 1)
+    if depth_m.ndim == 2:
+        depth_m = depth_m[..., np.newaxis]
+
+    # Return depth as float32 meters.
+    if output_unit == "m":
+        return torch.from_numpy(depth_m) if output_tensor else depth_m
+
+    # Return depth as uint16 millimeters.
+    mm = np.rint(depth_m * _MM_PER_METRE).clip(0, _UINT16_MAX).astype(np.uint16, copy=False)
+    if output_tensor:
+        # torch.uint16 support is very limited, we convert to float32 instead.
+        return torch.from_numpy(mm.astype(np.float32))
+    else:
+        return mm
@@ -321,7 +321,7 @@ def validate_feature_image_or_video(

    Args:
        name (str): The name of the feature.
-        expected_shape (list[str]): The expected shape (C, H, W).
+        expected_shape (list[str]): The expected shape, e.g. (C, H, W) or (H, W, C).
        value: The image data to validate.

    Returns:
@@ -42,10 +42,41 @@ def safe_stop_image_writer(func):


 def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True) -> PIL.Image.Image:
-    # TODO(aliberts): handle 1 channel and 4 for depth images
-    if image_array.ndim != 3:
-        raise ValueError(f"The array has {image_array.ndim} dimensions, but 3 is expected for an image.")
+    """Convert a NumPy array to a PIL Image, preserving precision for grayscale.

+    Behaviour by shape:
+
+    - ``(H, W)`` or ``(1, H, W)`` / ``(H, W, 1)``: single-channel grayscale.
+      The native dtype is preserved using the matching PIL mode
+      (``I;16`` / ``F``). This is the path used for raw depth maps (no rescaling, clamping, or downcasting)
+    - ``(3, H, W)`` / ``(H, W, 3)``: RGB. Channels-first inputs are transposed
+      to channels-last. Float inputs in ``[0, 1]`` are scaled to ``uint8``
+      (existing behaviour, gated by ``range_check``).
+
+    Other shapes / channel counts raise ``NotImplementedError`` or
+    ``ValueError``.
+    """
+    # TODO(CarolinePascal): 4 dimensions RGB-D images
+    if image_array.ndim not in (2, 3):
+        raise ValueError(f"The array has {image_array.ndim} dimensions, but 2 or 3 is expected for an image.")
+
+    # Squeeze 3D single-channel inputs to 2D so depth maps work whether the
+    # caller emits (H, W), (1, H, W), or (H, W, 1).
+    if image_array.ndim == 3:
+        if image_array.shape[0] == 1:
+            image_array = image_array[0]
+        elif image_array.shape[-1] == 1:
+            image_array = image_array[..., 0]
+
+    if image_array.ndim == 2:
+        if image_array.dtype not in [np.uint16, np.float32]:
+            raise ValueError(
+                f"Unsupported single-channel image dtype: {image_array.dtype}. "
+                f"Supported dtypes: {sorted(str(d) for d in [np.uint16, np.float32])}."
+            )
+        return PIL.Image.fromarray(np.ascontiguousarray(image_array))
+
+    # 3D path: must be RGB (3 channels), channels-first or channels-last.
    if image_array.shape[0] == 3:
        # Transpose from pytorch convention (C, H, W) to (H, W, C)
        image_array = image_array.transpose(1, 2, 0)
@@ -71,13 +102,28 @@ def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True)
    return PIL.Image.fromarray(image_array)


+def save_kwargs_for_path(fpath: Path, compress_level: int) -> dict:
+    """Pick the right format-specific kwargs for :meth:`PIL.Image.Image.save`.
+
+    PNG uses ``compress_level`` (0-9, zlib). TIFF uses ``compression`` (raw) for lossless raw depth maps.
+    """
+    suffix = Path(fpath).suffix.lower()
+    if suffix == ".png":
+        return {"compress_level": compress_level}
+    if suffix in (".tif", ".tiff"):
+        return {"compression": "raw"}
+    return {}
+
+
 def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path, compress_level: int = 1):
    """
    Saves a NumPy array or PIL Image to a file.

    This function handles both NumPy arrays and PIL Image objects, converting
    the former to a PIL Image before saving. It includes error handling for
-    the save operation.
+    the save operation. The output format is inferred from the *fpath*
+    extension: ``.png`` → PNG with ``compress_level``, ``.tiff`` / ``.tif``
+    → lossless raw depth maps (TIFF).

    Args:
        image (np.ndarray | PIL.Image.Image): The image data to save.
@@ -101,7 +147,7 @@ def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path, compress_level
            img = image
        else:
            raise TypeError(f"Unsupported image type: {type(image)}")
-        img.save(fpath, compress_level=compress_level)
+        img.save(fpath, **save_kwargs_for_path(fpath, compress_level))
    except Exception as e:
        logger.error("Error writing image %s: %s", fpath, e)

@@ -24,7 +24,7 @@ import torch.utils
 from huggingface_hub import HfApi, snapshot_download
 from huggingface_hub.errors import RevisionNotFoundError

-from lerobot.configs import VideoEncoderConfig
+from lerobot.configs import DepthEncoderConfig, VideoEncoderConfig
 from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE

 from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
@@ -60,6 +60,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        return_uint8: bool = False,
        batch_encoding_size: int = 1,
        camera_encoder: VideoEncoderConfig | None = None,
+        depth_encoder: DepthEncoderConfig | None = None,
        encoder_threads: int | None = None,
        streaming_encoding: bool = False,
        encoder_queue_maxsize: int = 30,
@@ -186,6 +187,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
            camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras
                (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults`
                is used by the writer.
+            depth_encoder (DepthEncoderConfig | None, optional): Video encoder settings for depth cameras
+                (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults`
+                is used by the writer.
            encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
                codec decide.
            streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
@@ -273,6 +277,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                streaming_enc = self._build_streaming_encoder(
                    self.meta.fps,
                    camera_encoder,
+                    depth_encoder,
                    encoder_queue_maxsize,
                    encoder_threads,
                )
@@ -280,6 +285,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                meta=self.meta,
                root=self.root,
                camera_encoder=camera_encoder,
+                depth_encoder=depth_encoder,
                encoder_threads=encoder_threads,
                batch_encoding_size=batch_encoding_size,
                streaming_encoder=streaming_enc,
@@ -322,12 +328,14 @@ class LeRobotDataset(torch.utils.data.Dataset):
    def _build_streaming_encoder(
        fps: int,
        camera_encoder: VideoEncoderConfig | None,
+        depth_encoder: DepthEncoderConfig | None,
        encoder_queue_maxsize: int,
        encoder_threads: int | None,
    ) -> StreamingVideoEncoder:
        return StreamingVideoEncoder(
            fps=fps,
            camera_encoder=camera_encoder,
+            depth_encoder=depth_encoder,
            queue_maxsize=encoder_queue_maxsize,
            encoder_threads=encoder_threads,
        )
@@ -645,6 +653,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        video_backend: str | None = None,
        batch_encoding_size: int = 1,
        camera_encoder: VideoEncoderConfig | None = None,
+        depth_encoder: DepthEncoderConfig | None = None,
        metadata_buffer_size: int = 10,
        streaming_encoding: bool = False,
        encoder_queue_maxsize: int = 30,
@@ -677,6 +686,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
                batch-encoding videos. ``1`` means encode immediately.
            camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
                When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
+            depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.).
+                When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` is used.
            encoder_threads: Number of encoder threads (global). ``None``
                lets the codec decide.
            metadata_buffer_size: Number of episode metadata records to buffer
@@ -720,12 +731,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
        streaming_enc = None
        if streaming_encoding and len(obj.meta.video_keys) > 0:
            streaming_enc = cls._build_streaming_encoder(
-                fps, camera_encoder, encoder_queue_maxsize, encoder_threads
+                fps, camera_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads
            )
        obj.writer = DatasetWriter(
            meta=obj.meta,
            root=obj.root,
            camera_encoder=camera_encoder,
+            depth_encoder=depth_encoder,
            encoder_threads=encoder_threads,
            batch_encoding_size=batch_encoding_size,
            streaming_encoder=streaming_enc,
@@ -749,6 +761,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
        video_backend: str | None = None,
        batch_encoding_size: int = 1,
        camera_encoder: VideoEncoderConfig | None = None,
+        depth_encoder: DepthEncoderConfig | None = None,
        encoder_threads: int | None = None,
        image_writer_processes: int = 0,
        image_writer_threads: int = 0,
@@ -778,6 +791,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
                batch-encoding videos.
            camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
                When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
+            depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.).
+                When ``None``, :func:`~lerobot.configs.depth.depth_encoder_defaults` is used.
            encoder_threads: Number of encoder threads (global). ``None``
                lets the codec decide.
            image_writer_processes: Subprocesses for async image writing.
@@ -824,12 +839,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
        streaming_enc = None
        if streaming_encoding and len(obj.meta.video_keys) > 0:
            streaming_enc = cls._build_streaming_encoder(
-                obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads
+                obj.meta.fps, camera_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads
            )
        obj.writer = DatasetWriter(
            meta=obj.meta,
            root=obj.root,
            camera_encoder=camera_encoder,
+            depth_encoder=depth_encoder,
            encoder_threads=encoder_threads,
            batch_encoding_size=batch_encoding_size,
            streaming_encoder=streaming_enc,
@@ -24,6 +24,7 @@ import logging
 from typing import Any

 import av
+import numpy as np

 logger = logging.getLogger(__name__)

@@ -31,6 +32,22 @@ FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE")
 FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64")


+def write_u16_plane(plane: av.video.plane.VideoPlane, src: np.ndarray, fill_value: int | None = None) -> None:
+    """Copy ``src`` into a uint16 plane respecting FFmpeg line padding."""
+    height, width = src.shape
+    stride_u16 = plane.line_size // np.dtype(np.uint16).itemsize
+    dst = np.frombuffer(plane, dtype=np.uint16).reshape(height, stride_u16)
+    if fill_value is not None:
+        dst.fill(fill_value)
+    dst[:, :width] = src
+
+
+@functools.cache
+def get_pix_fmt_channels(pix_fmt: str) -> int:
+    """Return the number of components (channels) for *pix_fmt*."""
+    return len(av.VideoFormat(pix_fmt).components)
+
+
@functools.cache
 def get_codec(vcodec: str) -> av.codec.Codec | None:
    """PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if unavailable."""
@@ -142,6 +159,16 @@ def _check_pixel_format(vcodec: str, pix_fmt: str) -> None:
        )


+def _check_pix_fmt_channels(pix_fmt: str, channels: int) -> None:
+    """Ensure *pix_fmt* can carry at least *channels* components."""
+    pix_fmt_channels = get_pix_fmt_channels(pix_fmt)
+    if pix_fmt_channels < channels:
+        raise ValueError(
+            f"pix_fmt={pix_fmt!r} carries only {pix_fmt_channels} component(s) "
+            f"but the source data has {channels} channel(s)."
+        )
+
+
 def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None:
    """Validate merged encoder options (typed) against the codec's published AVOptions."""
    supported_options = _get_codec_options_by_name(vcodec)
@@ -156,12 +183,18 @@ def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None:
        _check_option_value(vcodec, key, value, supported_options[key])


-def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options: dict[str, Any]) -> None:
+def check_video_encoder_parameters_pyav(
+    vcodec: str,
+    pix_fmt: str,
+    codec_options: dict[str, Any],
+    channels: int | None = None,
+) -> None:
    """Verify *config* is compatible with the bundled FFmpeg build.

    Checks pixel format, abstract tuning-field compatibility, and each merged
    encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options`
    against PyAV (including numeric ``extra_options`` present in that dict).
+    When given, additionally verify that *pix_fmt* carries as many components as the source data channels.
    No-op when ``config.vcodec`` isn't in the local FFmpeg build.

    Raises:
@@ -171,4 +204,6 @@ def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options
    if not options:
        raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build")
    _check_pixel_format(vcodec, pix_fmt)
+    if channels is not None:
+        _check_pix_fmt_channels(pix_fmt, channels)
    _check_codec_options(vcodec, codec_options)
@@ -93,6 +93,7 @@ DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
 DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
+DEFAULT_DEPTH_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.tiff"

 LEGACY_EPISODES_PATH = "meta/episodes.jsonl"
 LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
@@ -37,11 +37,16 @@ from datasets.features.features import register_feature
 from PIL import Image

 from lerobot.configs import (
+    DepthEncoderConfig,
    VideoEncoderConfig,
    camera_encoder_defaults,
+    depth_encoder_defaults,
 )
 from lerobot.utils.import_utils import get_safe_default_video_backend

+from .depth_utils import quantize_depth
+from .pyav_utils import get_pix_fmt_channels
+
 logger = logging.getLogger(__name__)


@@ -51,6 +56,7 @@ def decode_video_frames(
    tolerance_s: float,
    backend: str | None = None,
    return_uint8: bool = False,
+    is_depth: bool = False,
 ) -> torch.Tensor:
    """
    Decodes video frames using the specified backend.
@@ -70,6 +76,11 @@ def decode_video_frames(

    Currently supports torchcodec on cpu and pyav.
    """
+    if backend != "pyav" and is_depth:
+        logger.warning("Decoding depth maps is only supported with the 'pyav' backend.")
+        # We do not actually return uint8 here, but we avoid the 255 normalization step.
+        return decode_video_frames_pyav(video_path, timestamps, tolerance_s, return_uint8=True, is_depth=True)
+
    if backend is None:
        backend = get_safe_default_video_backend()
    if backend == "torchcodec":
@@ -89,6 +100,7 @@ def decode_video_frames_pyav(
    tolerance_s: float,
    log_loaded_timestamps: bool = False,
    return_uint8: bool = False,
+    is_depth: bool = False,
 ) -> torch.Tensor:
    """Loads frames associated to the requested timestamps of a video using PyAV.

@@ -138,9 +150,13 @@ def decode_video_frames_pyav(
            current_ts = float(frame.pts * stream.time_base)
            if log_loaded_timestamps:
                logger.info(f"frame loaded at timestamp={current_ts:.4f}")
-            # Convert to CHW uint8 to match torchcodec's output layout.
-            arr = frame.to_ndarray(format="rgb24")  # H, W, 3
-            loaded_frames.append(torch.from_numpy(arr).permute(2, 0, 1).contiguous())
+            if is_depth:
+                arr = frame.to_ndarray(format="gray12le")  # (H, W) uint12
+                loaded_frames.append(torch.from_numpy(arr).unsqueeze(0).contiguous())
+            else:
+                arr = frame.to_ndarray(format="rgb24")  # (H, W, 3)
+                # Convert to CHW uint8 to match torchcodec's output layout.
+                loaded_frames.append(torch.from_numpy(arr).permute(2, 0, 1).contiguous())
            loaded_ts.append(current_ts)
            if current_ts >= last_ts:
                break
@@ -335,17 +351,17 @@ def encode_video_frames(
    imgs_dir: Path | str,
    video_path: Path | str,
    fps: int,
-    camera_encoder: VideoEncoderConfig | None = None,
+    video_encoder: VideoEncoderConfig | None = None,
    encoder_threads: int | None = None,
    *,
    log_level: int | None = av.logging.WARNING,
    overwrite: bool = False,
 ) -> None:
    """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
-    if camera_encoder is None:
-        camera_encoder = camera_encoder_defaults()
-    vcodec = camera_encoder.vcodec
-    pix_fmt = camera_encoder.pix_fmt
+    if video_encoder is None:
+        video_encoder = camera_encoder_defaults()
+    vcodec = video_encoder.vcodec
+    pix_fmt = video_encoder.pix_fmt

    video_path = Path(video_path)
    imgs_dir = Path(imgs_dir)
@@ -357,7 +373,8 @@ def encode_video_frames(
    video_path.parent.mkdir(parents=True, exist_ok=True)

    # Get input frames
-    template = "frame-" + ("[0-9]" * 6) + ".png"
+    suffix = ".png" if not isinstance(video_encoder, DepthEncoderConfig) else ".tiff"
+    template = "frame-" + ("[0-9]" * 6) + suffix
    input_list = sorted(
        glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0])
    )
@@ -367,7 +384,7 @@ def encode_video_frames(
    with Image.open(input_list[0]) as dummy_image:
        width, height = dummy_image.size

-    video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True)
+    video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True)

    # Set logging level
    if log_level is not None:
@@ -519,22 +536,21 @@ class _CameraEncoderThread(threading.Thread):
        self,
        video_path: Path,
        fps: int,
-        vcodec: str,
-        pix_fmt: str,
-        codec_options: dict[str, str],
+        video_encoder: VideoEncoderConfig,
        frame_queue: queue.Queue,
        result_queue: queue.Queue,
        stop_event: threading.Event,
+        encoder_threads: int | None = None,
    ):
        super().__init__(daemon=True)
        self.video_path = video_path
        self.fps = fps
-        self.vcodec = vcodec
-        self.pix_fmt = pix_fmt
-        self.codec_options = codec_options
+        self.video_encoder = video_encoder
+        self.is_depth = isinstance(video_encoder, DepthEncoderConfig)
        self.frame_queue = frame_queue
        self.result_queue = result_queue
        self.stop_event = stop_event
+        self.encoder_threads = encoder_threads

    def run(self) -> None:
        from .compute_stats import RunningQuantileStats, auto_downsample_height_width
@@ -559,12 +575,12 @@ class _CameraEncoderThread(threading.Thread):
                    # Sentinel: flush and close
                    break

-                # Ensure HWC uint8 numpy array
+                # Ensure HWC (RGB or depth) uint8 (RGB only) numpy array
                if isinstance(frame_data, np.ndarray):
-                    if frame_data.ndim == 3 and frame_data.shape[0] == 3:
+                    if frame_data.ndim == 3 and frame_data.shape[0] in (1, 3):
                        # CHW -> HWC
                        frame_data = frame_data.transpose(1, 2, 0)
-                    if frame_data.dtype != np.uint8:
+                    if not self.is_depth and frame_data.dtype != np.uint8:
                        frame_data = (frame_data * 255).astype(np.uint8)

                # Open container on first frame (to get width/height)
@@ -572,15 +588,29 @@ class _CameraEncoderThread(threading.Thread):
                    height, width = frame_data.shape[:2]
                    Path(self.video_path).parent.mkdir(parents=True, exist_ok=True)
                    container = av.open(str(self.video_path), "w")
-                    output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options)
-                    output_stream.pix_fmt = self.pix_fmt
+                    output_stream = container.add_stream(
+                        self.video_encoder.vcodec,
+                        self.fps,
+                        options=self.video_encoder.get_codec_options(self.encoder_threads, as_strings=True),
+                    )
+                    output_stream.pix_fmt = self.video_encoder.pix_fmt
                    output_stream.width = width
                    output_stream.height = height
                    output_stream.time_base = Fraction(1, self.fps)

                # Encode frame with explicit timestamps
-                pil_img = Image.fromarray(frame_data)
-                video_frame = av.VideoFrame.from_image(pil_img)
+                if not self.is_depth:
+                    pil_img = Image.fromarray(frame_data)
+                    video_frame = av.VideoFrame.from_image(pil_img)
+                else:
+                    video_frame = quantize_depth(
+                        frame_data,
+                        depth_min=self.video_encoder.depth_min,
+                        depth_max=self.video_encoder.depth_max,
+                        shift=self.video_encoder.shift,
+                        use_log=self.video_encoder.use_log,
+                        video_backend=self.video_encoder.video_backend,
+                    )
                video_frame.pts = frame_count
                video_frame.time_base = Fraction(1, self.fps)
                packet = output_stream.encode(video_frame)
@@ -639,6 +669,7 @@ class StreamingVideoEncoder:
        self,
        fps: int,
        camera_encoder: VideoEncoderConfig | None = None,
+        depth_encoder: DepthEncoderConfig | None = None,
        queue_maxsize: int = 30,
        encoder_threads: int | None = None,
    ):
@@ -654,6 +685,7 @@ class StreamingVideoEncoder:
        """
        self.fps = fps
        self._camera_encoder = camera_encoder or camera_encoder_defaults()
+        self._depth_encoder = depth_encoder or depth_encoder_defaults()
        self._encoder_threads = encoder_threads
        self.queue_maxsize = queue_maxsize

@@ -666,18 +698,25 @@ class StreamingVideoEncoder:
        self._episode_active = False
        self._closed = False

-    def start_episode(self, video_keys: list[str], temp_dir: Path) -> None:
+    def start_episode(
+        self, video_keys: list[str], temp_dir: Path, depth_video_keys: list[str] | None = None
+    ) -> None:
        """Start encoder threads for a new episode.

        Args:
            video_keys: List of video feature keys (e.g. ["observation.images.laptop"])
            temp_dir: Base directory for temporary MP4 files
+            depth_video_keys: List of video feature keys that carry depth maps (e.g.
+                ["observation.images.laptop_depth"]).  Defaults to ``[]`` (no depth keys).
        """
        if self._episode_active:
            self.cancel_episode()

        self._dropped_frames.clear()

+        if depth_video_keys is None:
+            depth_video_keys = []
+
        for video_key in video_keys:
            frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize)
            result_queue: queue.Queue = queue.Queue(maxsize=1)
@@ -686,17 +725,15 @@ class StreamingVideoEncoder:
            temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
            video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"

-            vcodec = self._camera_encoder.vcodec
-            codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True)
+            encoder = self._depth_encoder if video_key in depth_video_keys else self._camera_encoder
            encoder_thread = _CameraEncoderThread(
                video_path=video_path,
                fps=self.fps,
-                vcodec=vcodec,
-                pix_fmt=self._camera_encoder.pix_fmt,
-                codec_options=codec_options,
+                video_encoder=encoder,
                frame_queue=frame_queue,
                result_queue=result_queue,
                stop_event=stop_event,
+                encoder_threads=self._encoder_threads,
            )
            encoder_thread.start()

@@ -903,13 +940,13 @@ def get_audio_info(video_path: Path | str) -> dict:

 def get_video_info(
    video_path: Path | str,
-    camera_encoder: VideoEncoderConfig | None = None,
+    video_encoder: VideoEncoderConfig | None = None,
 ) -> dict:
    """Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``.

    Args:
        video_path: Path to the encoded video file to probe.
-        camera_encoder: If provided, record the exact encoder settings used to encode this
+        video_encoder: If provided, record the exact encoder settings used to encode this
            video. Stream-derived values take precedence — encoder fields are only written for keys
            not already populated from the video file itself.
    """
@@ -929,13 +966,10 @@ def get_video_info(
        video_info["video.width"] = video_stream.width
        video_info["video.codec"] = video_stream.codec.canonical_name
        video_info["video.pix_fmt"] = video_stream.pix_fmt
-        video_info["video.is_depth_map"] = False

        # Calculate fps from r_frame_rate
        video_info["video.fps"] = int(video_stream.base_rate)
-
-        pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
-        video_info["video.channels"] = pixel_channels
+        video_info["video.channels"] = get_pix_fmt_channels(video_stream.pix_fmt)

    # Reset logging level
    av.logging.restore_default_callback()
@@ -944,27 +978,18 @@ def get_video_info(
    video_info.update(**get_audio_info(video_path))

    # Add additional encoder configuration if provided
-    if camera_encoder is not None:
-        for field_name, field_value in asdict(camera_encoder).items():
+    if video_encoder is not None:
+        for field_name, field_value in asdict(video_encoder).items():
            # vcodec is already populated from the video stream
            if field_name == "vcodec":
                continue
            video_info.setdefault(f"video.{field_name}", field_value)

+    video_info["is_depth_map"] = isinstance(video_encoder, DepthEncoderConfig)
+
    return video_info


-def get_video_pixel_channels(pix_fmt: str) -> int:
-    if "gray" in pix_fmt or "depth" in pix_fmt or "monochrome" in pix_fmt:
-        return 1
-    elif "rgba" in pix_fmt or "yuva" in pix_fmt:
-        return 4
-    elif "rgb" in pix_fmt or "yuv" in pix_fmt:
-        return 3
-    else:
-        raise ValueError("Unknown format")
-
-
 def get_video_duration_in_s(video_path: Path | str) -> float:
    """
    Get the duration of a video file in seconds using PyAV.
@@ -68,9 +68,12 @@ class SOFollower(Robot):

    @property
    def _cameras_ft(self) -> dict[str, tuple]:
-        return {
-            cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras
-        }
+        features: dict[str, tuple] = {}
+        for cam in self.cameras:
+            features[cam] = (self.cameras[cam].height, self.cameras[cam].width, 3)
+            if getattr(self.cameras[cam], "use_depth", False):
+                features[f"{cam}_depth"] = (self.cameras[cam].height, self.cameras[cam].width, 1)
+        return features

    @cached_property
    def observation_features(self) -> dict[str, type | tuple]:
@@ -190,6 +193,12 @@ class SOFollower(Robot):
            dt_ms = (time.perf_counter() - start) * 1e3
            logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms")

+            if getattr(cam, "use_depth", False):
+                start = time.perf_counter()
+                obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth()
+                dt_ms = (time.perf_counter() - start) * 1e3
+                logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms")
+
        return obs_dict

    @check_if_not_connected
@@ -333,6 +333,7 @@ def build_rollout_context(
                root=cfg.dataset.root,
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
                encoder_threads=cfg.dataset.encoder_threads,
@@ -368,6 +369,7 @@ def build_rollout_context(
                * len(robot.cameras if hasattr(robot, "cameras") else []),
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
                encoder_threads=cfg.dataset.encoder_threads,
@@ -399,6 +399,7 @@ def record(
                root=cfg.dataset.root,
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                encoder_threads=cfg.dataset.encoder_threads,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
@@ -428,6 +429,7 @@ def record(
                image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                camera_encoder=cfg.dataset.camera_encoder,
+                depth_encoder=cfg.dataset.depth_encoder,
                encoder_threads=cfg.dataset.encoder_threads,
                streaming_encoding=cfg.dataset.streaming_encoding,
                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
@@ -69,6 +69,7 @@ def hw_to_dataset_features(
        for key, ftype in hw_features.items()
        if ftype is float or (isinstance(ftype, PolicyFeature) and ftype.type != FeatureType.VISUAL)
    }
+    # TODO(CarolinePascal): we should not rely on the shape to determine if a feature is a camera !
    cam_fts = {key: shape for key, shape in hw_features.items() if isinstance(shape, tuple)}

    if joint_fts and prefix == ACTION:
@@ -86,11 +87,19 @@ def hw_to_dataset_features(
        }

    for key, shape in cam_fts.items():
-        features[f"{prefix}.images.{key}"] = {
-            "dtype": "video" if use_video else "image",
-            "shape": shape,
-            "names": ["height", "width", "channels"],
-        }
+        dtype = "video" if use_video else "image"
+        if len(shape) == 3 and shape[2] in (1, 3):
+            features[f"{prefix}.images.{key}"] = {
+                "dtype": dtype,
+                "shape": shape,
+                "names": ["height", "width", "channels"],
+                "info": {"is_depth_map": shape[2] == 1},
+            }
+        else:
+            raise ValueError(
+                f"Camera feature '{key}' has shape {shape}. "
+                f"Expected a 3-tuple (H, W, C), e.g. (480, 640, 3) for RGB or (480, 640, 1) for depth."
+            )

    _validate_feature_names(features)
    return features
@@ -149,11 +158,11 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea
            type = FeatureType.VISUAL
            if len(shape) != 3:
                raise ValueError(f"Number of dimensions of {key} != 3 (shape={shape})")
-
-            names = ft["names"]
-            # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets.
-            if names[2] in ["channel", "channels"]:  # (h, w, c) -> (c, h, w)
-                shape = (shape[2], shape[0], shape[1])
+            else:
+                names = ft["names"]
+                # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets.
+                if names[2] in ["channel", "channels"]:  # (h, w, c) -> (c, h, w)
+                    shape = (shape[2], shape[0], shape[1])
        elif key == OBS_ENV_STATE:
            type = FeatureType.ENV
        elif key.startswith(OBS_STR):
@@ -107,8 +107,15 @@ def log_rerun_data(
                    for i, vi in enumerate(arr):
                        rr.log(f"{key}_{i}", rr.Scalars(float(vi)))
                else:
-                    img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr)
-                    rr.log(key, entity=img_entity, static=True)
+                    if arr.shape[-1] == 1:
+                        img_entity = (
+                            rr.DepthImage(arr, colormap=rr.components.Colormap.Viridis).compress()
+                            if compress_images
+                            else rr.DepthImage(arr, colormap=rr.components.Colormap.Viridis)
+                        )
+                    else:
+                        img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr)
+                    rr.log(key, entity=img_entity)

    if action:
        for k, v in action.items():
@@ -59,11 +59,13 @@ def _make_dummy_stats(features: dict) -> dict:
    stats = {}
    for key, ft in features.items():
        if ft["dtype"] in ("image", "video"):
+            channels = ft["shape"][-1]
+            stat_shape = (channels, 1, 1)
            stats[key] = {
-                "max": np.ones((3, 1, 1), dtype=np.float32),
-                "mean": np.full((3, 1, 1), 0.5, dtype=np.float32),
-                "min": np.zeros((3, 1, 1), dtype=np.float32),
-                "std": np.full((3, 1, 1), 0.25, dtype=np.float32),
+                "max": np.ones(stat_shape, dtype=np.float32),
+                "mean": np.full(stat_shape, 0.5, dtype=np.float32),
+                "min": np.zeros(stat_shape, dtype=np.float32),
+                "std": np.full(stat_shape, 0.25, dtype=np.float32),
                "count": np.array([5]),
            }
        elif ft["dtype"] in ("float32", "float64", "int64"):
@@ -142,6 +144,45 @@ def test_create_without_videos_has_no_video_path(tmp_path):
    assert meta.video_keys == []


+@pytest.mark.parametrize(
+    ("marker_field", "marker_key"),
+    [
+        ("info", "is_depth_map"),
+        ("info", "video.is_depth_map"),
+        ("video_info", "video.is_depth_map"),
+    ],
+    ids=["info.is_depth_map", "info.video.is_depth_map_legacy", "video_info.video.is_depth_map_legacy"],
+)
+def test_depth_keys_property_filters_by_marker(tmp_path, marker_field, marker_key):
+    """``depth_keys`` recognises the canonical and the two legacy marker variants."""
+    depth_feature = {
+        "dtype": "video",
+        "shape": (64, 96, 1),
+        "names": ["height", "width", "channels"],
+        marker_field: {marker_key: True},
+    }
+    features = {
+        **VIDEO_FEATURES,
+        "observation.images.laptop_depth": depth_feature,
+    }
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/depth_keys",
+        fps=DEFAULT_FPS,
+        features=features,
+        root=tmp_path / f"depth_keys_{marker_field}_{marker_key.replace('.', '_')}",
+    )
+
+    assert set(meta.video_keys) == {"observation.images.laptop", "observation.images.laptop_depth"}
+    assert meta.depth_keys == ["observation.images.laptop_depth"]
+
+
+def test_depth_keys_empty_when_no_marker(tmp_path):
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/no_depth", fps=DEFAULT_FPS, features=VIDEO_FEATURES, root=tmp_path / "no_depth"
+    )
+    assert meta.depth_keys == []
+
+
 def test_create_raises_on_existing_directory(tmp_path):
    """create() raises if root directory already exists."""
    root = tmp_path / "existing"
@@ -53,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict:
 # ── Existing encode_video_worker tests ───────────────────────────────


-def test_encode_video_worker_forwards_camera_encoder(tmp_path):
-    """_encode_video_worker forwards camera_encoder to encode_video_frames."""
+def test_encode_video_worker_forwards_video_encoder(tmp_path):
+    """_encode_video_worker forwards video_encoder to encode_video_frames."""
    video_key = "observation.images.laptop"
    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
    img_dir = tmp_path / Path(fpath).parent
@@ -74,16 +74,16 @@ def test_encode_video_worker_forwards_camera_encoder(tmp_path):
            0,
            tmp_path,
            fps=30,
-            camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
+            video_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
            encoder_threads=4,
        )

-    assert captured_kwargs["camera_encoder"].vcodec == "h264"
+    assert captured_kwargs["video_encoder"].vcodec == "h264"
    assert captured_kwargs["encoder_threads"] == 4


-def test_encode_video_worker_default_camera_encoder(tmp_path):
-    """_encode_video_worker passes None camera_encoder which encode_video_frames defaults."""
+def test_encode_video_worker_default_video_encoder(tmp_path):
+    """_encode_video_worker passes None video_encoder which encode_video_frames defaults."""
    video_key = "observation.images.laptop"
    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
    img_dir = tmp_path / Path(fpath).parent
@@ -100,7 +100,7 @@ def test_encode_video_worker_default_camera_encoder(tmp_path):
    with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
        _encode_video_worker(video_key, 0, tmp_path, fps=30)

-    assert captured_kwargs["camera_encoder"] is None
+    assert captured_kwargs["video_encoder"] is None
    assert captured_kwargs["encoder_threads"] is None


@@ -1480,10 +1480,15 @@ def test_valid_video_codecs_constant():
    assert "h264" in VALID_VIDEO_CODECS
    assert "hevc" in VALID_VIDEO_CODECS
    assert "libsvtav1" in VALID_VIDEO_CODECS
+    assert "ffv1" in VALID_VIDEO_CODECS
    assert "auto" in VALID_VIDEO_CODECS
    assert "h264_videotoolbox" in VALID_VIDEO_CODECS
    assert "h264_nvenc" in VALID_VIDEO_CODECS
-    assert len(VALID_VIDEO_CODECS) == 10
+    assert "h264_vaapi" in VALID_VIDEO_CODECS
+    assert "h264_qsv" in VALID_VIDEO_CODECS
+    assert "hevc_videotoolbox" in VALID_VIDEO_CODECS
+    assert "hevc_nvenc" in VALID_VIDEO_CODECS
+    assert len(VALID_VIDEO_CODECS) == 11    


 def test_delta_timestamps_with_episodes_filter(tmp_path, empty_lerobot_dataset_factory):
@@ -0,0 +1,307 @@
+"""Tests for the depth-integration feature.
+
+Covers quantization/dequantization round-trips (depth_utils), image writer
+depth support (image_writer), hardware→dataset feature routing
+(feature_utils), video info helpers (video_utils / configs.video), and
+feature-to-file-format routing through the dataset writer.
+
+Depth metadata detection on ``LeRobotDatasetMetadata.depth_keys`` (canonical
+and legacy marker variants) lives in ``test_dataset_metadata.py``.
+"""
+
+from pathlib import Path
+
+import numpy as np
+import PIL.Image
+import pytest
+import torch
+
+pytest.importorskip("av", reason="av is required (install lerobot[dataset])")
+
+import av
+
+from lerobot.configs import DepthEncoderConfig
+from lerobot.configs.video import DEPTH_QMAX, VALID_VIDEO_CODECS
+from lerobot.datasets.depth_utils import dequantize_depth, quantize_depth
+from lerobot.datasets.image_writer import (
+    image_array_to_pil_image,
+    save_kwargs_for_path,
+    write_image,
+)
+from lerobot.datasets.pyav_utils import get_pix_fmt_channels
+from tests.fixtures.constants import (
+    DEFAULT_FPS,
+    DUMMY_CAMERA_FEATURES,
+    DUMMY_DEPTH_CAMERA_FEATURES,
+    DUMMY_MOTOR_FEATURES,
+    DUMMY_REPO_ID,
+)
+
+H, W = 48, 64
+DEPTH_MIN = 0.01
+DEPTH_MAX = 10.0
+
+
+# ── 1. Quantize / Dequantize round-trips ────────────────────────────
+
+
+class TestQuantizeDequantize:
+    """Core numerical tests for depth_utils.quantize_depth / dequantize_depth."""
+
+    def _make_depth_metres(self) -> np.ndarray:
+        """Linearly-spaced float32 depth in metres covering the default range."""
+        return np.linspace(DEPTH_MIN, DEPTH_MAX, H * W, dtype=np.float32).reshape(H, W)
+
+    def test_roundtrip_linear_metres(self):
+        depth = self._make_depth_metres()
+        quantized = quantize_depth(depth, use_log=False, video_backend=None)
+        recovered = dequantize_depth(quantized, use_log=False, output_unit="m")
+
+        assert recovered.shape == (H, W, 1), f"Expected (H,W,1), got {recovered.shape}"
+        assert recovered.dtype == np.float32
+        tol = (DEPTH_MAX - DEPTH_MIN) / DEPTH_QMAX
+        np.testing.assert_allclose(recovered[..., 0], depth, atol=tol + 1e-6)
+
+    def test_roundtrip_log_metres(self):
+        depth = self._make_depth_metres()
+        quantized = quantize_depth(depth, use_log=True, video_backend=None)
+        recovered = dequantize_depth(quantized, use_log=True, output_unit="m")
+
+        assert recovered.shape == (H, W, 1)
+        near = depth < 1.0
+        far = depth > 8.0
+        err_near = np.abs(recovered[..., 0][near] - depth[near])
+        err_far = np.abs(recovered[..., 0][far] - depth[far])
+        assert err_near.mean() < err_far.mean(), "Log quant should be more precise at close range"
+
+    def test_roundtrip_mm_uint16_input(self):
+        depth_mm = np.linspace(10, 10000, H * W, dtype=np.float64).reshape(H, W).astype(np.uint16)
+        quantized = quantize_depth(depth_mm, use_log=False, video_backend=None, input_unit="mm")
+        recovered = dequantize_depth(quantized, use_log=False, output_unit="mm")
+
+        assert recovered.dtype == np.uint16
+        tol_mm = (DEPTH_MAX - DEPTH_MIN) * 1000.0 / DEPTH_QMAX
+        np.testing.assert_allclose(
+            recovered[..., 0].astype(np.float64), depth_mm.astype(np.float64), atol=tol_mm + 1.0
+        )
+
+    def test_quantize_clamps_out_of_range(self):
+        depth = np.array([[0.001, 99.0]], dtype=np.float32)
+        quantized = quantize_depth(depth, use_log=False, video_backend=None)
+        assert quantized[0, 0] == 0
+        assert quantized[0, 1] == DEPTH_QMAX
+
+    def test_quantize_accepts_torch_tensor(self):
+        t = torch.rand(H, W, dtype=torch.float32) * (DEPTH_MAX - DEPTH_MIN) + DEPTH_MIN
+        result = quantize_depth(t, video_backend=None)
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.uint16
+
+    def test_quantize_squeezes_channel_dim(self):
+        depth = self._make_depth_metres()
+        for shape in [(H, W, 1), (1, H, W)]:
+            reshaped = depth.reshape(shape)
+            quantized = quantize_depth(reshaped, video_backend=None)
+            assert quantized.ndim == 2, f"Input shape {shape} should be squeezed to 2D"
+
+    def test_quantize_returns_pyav_frame(self):
+        depth = self._make_depth_metres()
+        result = quantize_depth(depth, video_backend="pyav")
+        assert isinstance(result, av.VideoFrame)
+
+    def test_dequantize_output_tensor(self):
+        quantized = np.full((H, W), DEPTH_QMAX // 2, dtype=np.uint16)
+        result = dequantize_depth(quantized, output_unit="m", output_tensor=True)
+        assert isinstance(result, torch.Tensor)
+        assert result.shape == (H, W, 1)
+
+    def test_invalid_log_params_raises(self):
+        depth = np.ones((4, 4), dtype=np.float32)
+        with pytest.raises(ValueError, match="depth_min \\+ shift must be positive"):
+            quantize_depth(depth, depth_min=1.0, shift=-2.0, use_log=True, video_backend=None)
+
+
+# ── 2. Image writer depth support ───────────────────────────────────
+
+
+class TestImageWriterDepth:
+    """image_array_to_pil_image and write_image for single-channel depth maps."""
+
+    def test_pil_uint16_grayscale(self):
+        arr = np.arange(H * W, dtype=np.uint16).reshape(H, W)
+        img = image_array_to_pil_image(arr)
+        assert isinstance(img, PIL.Image.Image)
+        assert img.mode == "I;16"
+        assert img.size == (W, H)
+
+    def test_pil_float32_grayscale(self):
+        arr = np.random.rand(H, W).astype(np.float32)
+        img = image_array_to_pil_image(arr)
+        assert img.mode == "F"
+
+    def test_pil_squeeze_hwc1_and_1hw(self):
+        arr_uint16 = np.zeros((H, W), dtype=np.uint16)
+        for input_arr in [arr_uint16.reshape(H, W, 1), arr_uint16.reshape(1, H, W)]:
+            img = image_array_to_pil_image(input_arr)
+            assert img.size == (W, H)
+
+    def test_save_kwargs_png_vs_tiff(self):
+        png_kw = save_kwargs_for_path(Path("frame.png"), compress_level=5)
+        assert png_kw == {"compress_level": 5}
+
+        tiff_kw = save_kwargs_for_path(Path("frame.tiff"), compress_level=5)
+        assert tiff_kw == {"compression": "raw"}
+
+        assert save_kwargs_for_path(Path("frame.jpg"), compress_level=5) == {}
+
+    def test_write_image_tiff_roundtrip(self, tmp_path):
+        arr = np.arange(H * W, dtype=np.uint16).reshape(H, W)
+        fpath = tmp_path / "depth.tiff"
+        write_image(arr, fpath)
+
+        assert fpath.exists()
+        with PIL.Image.open(fpath) as loaded:
+            recovered = np.array(loaded)
+        np.testing.assert_array_equal(recovered, arr)
+
+
+# ── 3. Feature routing ──────────────────────────────────────────────
+
+
+class TestHwToDatasetFeaturesDepth:
+    """hw_to_dataset_features marks single-channel cameras as depth."""
+
+    def test_single_channel_cam_marked_depth(self):
+        from lerobot.utils.feature_utils import hw_to_dataset_features
+
+        features = hw_to_dataset_features({"cam": (480, 640, 1)}, prefix="observation")
+        ft = features["observation.images.cam"]
+        assert ft["info"]["is_depth_map"] is True
+
+    def test_three_channel_cam_not_depth(self):
+        from lerobot.utils.feature_utils import hw_to_dataset_features
+
+        features = hw_to_dataset_features({"cam": (480, 640, 3)}, prefix="observation")
+        ft = features["observation.images.cam"]
+        assert ft["info"]["is_depth_map"] is False
+
+    def test_invalid_channel_count_raises(self):
+        from lerobot.utils.feature_utils import hw_to_dataset_features
+
+        with pytest.raises(ValueError, match="Expected a 3-tuple"):
+            hw_to_dataset_features({"cam": (480, 640, 2)}, prefix="observation")
+
+
+# ── 4. Video info depth flag ────────────────────────────────────────
+
+
+class TestVideoInfoDepthFlag:
+    """Misc depth-related constants and helpers in video_utils / configs."""
+
+    def test_get_pix_fmt_channels_gray(self):
+        assert get_pix_fmt_channels("gray12le") == 1
+        assert get_pix_fmt_channels("gray8") == 1
+
+    def test_ffv1_in_valid_codecs(self):
+        assert "ffv1" in VALID_VIDEO_CODECS
+
+
+# ── 5. Feature-to-file-format routing ───────────────────────────────
+
+
+def _build_mixed_features(dtype: str) -> dict:
+    """Build a feature dict with one RGB camera and one depth camera.
+
+    Uses shapes from ``DUMMY_CAMERA_FEATURES`` and ``DUMMY_DEPTH_CAMERA_FEATURES``
+    defined in ``tests.fixtures.constants``.
+    """
+    rgb_cam = next(iter(DUMMY_CAMERA_FEATURES.values()))
+    depth_cam = next(iter(DUMMY_DEPTH_CAMERA_FEATURES.values()))
+    return {
+        "observation.images.rgb": {"dtype": dtype, **rgb_cam},
+        "observation.images.depth": {"dtype": dtype, **depth_cam},
+        **{k: {"dtype": v["dtype"], **v} for k, v in DUMMY_MOTOR_FEATURES.items()},
+    }
+
+
+def _make_mixed_frame(features: dict) -> dict:
+    """Build a valid frame dict matching the given feature schema."""
+    frame: dict = {"task": "test task"}
+    for key, ft in features.items():
+        shape = ft["shape"]
+        if ft["dtype"] in ("image", "video"):
+            channels = shape[-1]
+            if channels == 1:
+                frame[key] = np.random.randint(0, 4095, shape, dtype=np.uint16)
+            else:
+                frame[key] = np.random.randint(0, 255, shape, dtype=np.uint8)
+        else:
+            frame[key] = np.random.randn(*shape).astype(ft["dtype"])
+    return frame
+
+
+class TestFeatureFileRouting:
+    """Verify that depth vs RGB features are routed to the correct file format."""
+
+    NUM_FRAMES = 5
+
+    def test_no_video_depth_tiff_rgb_png(self, tmp_path):
+        """Without video encoding: depth -> .tiff, RGB -> .png."""
+        from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+        features = _build_mixed_features(dtype="image")
+
+        dataset = LeRobotDataset.create(
+            repo_id=DUMMY_REPO_ID,
+            fps=DEFAULT_FPS,
+            features=features,
+            root=tmp_path / "ds",
+            use_videos=False,
+        )
+
+        for _ in range(self.NUM_FRAMES):
+            dataset.add_frame(_make_mixed_frame(features))
+
+        buf = dataset.writer.episode_buffer
+        depth_paths = [Path(p) for p in buf["observation.images.depth"]]
+        rgb_paths = [Path(p) for p in buf["observation.images.rgb"]]
+
+        assert all(p.suffix == ".tiff" for p in depth_paths), "Depth frames should be .tiff"
+        assert all(p.suffix == ".png" for p in rgb_paths), "RGB frames should be .png"
+        assert all(p.exists() for p in depth_paths), "Depth TIFF files should exist on disk"
+        assert all(p.exists() for p in rgb_paths), "RGB PNG files should exist on disk"
+
+        dataset.save_episode()
+        dataset.finalize()
+
+    def test_video_depth_uses_depth_encoder(self, tmp_path):
+        """With streaming video encoding: depth keys use DepthEncoderConfig, RGB keys do not."""
+        from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+        features = _build_mixed_features(dtype="video")
+
+        dataset = LeRobotDataset.create(
+            repo_id=DUMMY_REPO_ID,
+            fps=DEFAULT_FPS,
+            features=features,
+            root=tmp_path / "ds",
+            use_videos=True,
+            streaming_encoding=True,
+        )
+
+        assert dataset.writer._streaming_encoder is not None
+        encoder = dataset.writer._streaming_encoder
+
+        for _ in range(self.NUM_FRAMES):
+            dataset.add_frame(_make_mixed_frame(features))
+
+        rgb_thread = encoder._threads["observation.images.rgb"]
+        depth_thread = encoder._threads["observation.images.depth"]
+
+        assert not isinstance(rgb_thread.video_encoder, DepthEncoderConfig)
+        assert isinstance(depth_thread.video_encoder, DepthEncoderConfig)
+        assert depth_thread.is_depth is True
+        assert rgb_thread.is_depth is False
+
+        dataset.save_episode()
+        dataset.finalize()
@@ -94,7 +94,7 @@ def test_image_array_to_pil_image_pytorch_format(img_array_factory):

 def test_image_array_to_pil_image_single_channel(img_array_factory):
    img_array = img_array_factory(channels=1)
-    with pytest.raises(NotImplementedError):
+    with pytest.raises(ValueError, match="Unsupported single-channel image dtype"):
        image_array_to_pil_image(img_array)


@@ -61,9 +61,7 @@ class TestCameraEncoderThread:
        encoder_thread = _CameraEncoderThread(
            video_path=video_path,
            fps=fps,
-            vcodec=enc_cfg.vcodec,
-            pix_fmt=enc_cfg.pix_fmt,
-            codec_options=enc_cfg.get_codec_options(as_strings=True),
+            video_encoder=enc_cfg,
            frame_queue=frame_queue,
            result_queue=result_queue,
            stop_event=stop_event,
@@ -112,9 +110,7 @@ class TestCameraEncoderThread:
        encoder_thread = _CameraEncoderThread(
            video_path=video_path,
            fps=fps,
-            vcodec=enc_cfg.vcodec,
-            pix_fmt=enc_cfg.pix_fmt,
-            codec_options=enc_cfg.get_codec_options(as_strings=True),
+            video_encoder=enc_cfg,
            frame_queue=frame_queue,
            result_queue=result_queue,
            stop_event=stop_event,
@@ -146,9 +142,7 @@ class TestCameraEncoderThread:
        encoder_thread = _CameraEncoderThread(
            video_path=video_path,
            fps=fps,
-            vcodec=enc_cfg.vcodec,
-            pix_fmt=enc_cfg.pix_fmt,
-            codec_options=enc_cfg.get_codec_options(as_strings=True),
+            video_encoder=enc_cfg,
            frame_queue=frame_queue,
            result_queue=result_queue,
            stop_event=stop_event,
@@ -391,7 +385,8 @@ class TestStreamingVideoEncoder:

        # Verify codec options include thread tuning for libsvtav1 (lp=…)
        thread = encoder._threads[f"{OBS_IMAGES}.cam"]
-        assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options
+        codec_opts = thread.video_encoder.get_codec_options(encoder_threads=thread.encoder_threads)
+        assert "svtav1-params" in codec_opts or "threads" in codec_opts

        # Feed some frames and finish to ensure it works end-to-end
        num_frames = 10
@@ -26,7 +26,7 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])")

 import av  # noqa: E402

-from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig
+from lerobot.configs import VALID_VIDEO_CODECS, DepthEncoderConfig, VideoEncoderConfig
 from lerobot.datasets.image_writer import write_image
 from lerobot.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.datasets.pyav_utils import get_codec
@@ -338,7 +338,7 @@ def _encode_video(
 ) -> Path:
    imgs_dir = path.parent / f"imgs_{path.stem}"
    _write_frames(imgs_dir, num_frames=num_frames)
-    encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True)
+    encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True)
    return path


@@ -368,7 +368,7 @@ class TestGetVideoInfo:
        assert info["video.pix_fmt"] == "yuv420p"
        assert info["video.fps"] == 30
        assert info["video.channels"] == 3
-        assert info["video.is_depth_map"] is False
+        assert info["is_depth_map"] is False
        assert info["has_audio"] is False
        assert "video.g" not in info
        assert "video.crf" not in info
@@ -378,7 +378,7 @@ class TestGetVideoInfo:
    def test_merges_encoder_config_as_video_prefixed_entries(self):
        cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)

-        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg)
+        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", video_encoder=cfg)

        assert info["video.g"] == 2
        assert info["video.crf"] == 30
@@ -391,11 +391,16 @@ class TestGetVideoInfo:
    def test_stream_derived_keys_take_precedence_over_config(self):
        cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p")

-        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg)
+        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", video_encoder=cfg)

        assert info["video.codec"]  # populated from stream, not from config's vcodec
        assert info["video.pix_fmt"] == "yuv420p"

+    def test_depth_encoder_config_sets_is_depth_map_true(self):
+        """A ``DepthEncoderConfig`` causes ``get_video_info`` to mark the stream as depth."""
+        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", video_encoder=DepthEncoderConfig())
+        assert info["is_depth_map"] is True
+

 class TestEncodeVideoFrames:
    @require_libsvtav1
@@ -454,7 +459,7 @@ class TestEncodeVideoFrames:
        cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10)
        video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg)

-        info = get_video_info(video_path, camera_encoder=cfg)
+        info = get_video_info(video_path, video_encoder=cfg)

        # Stream-derived
        assert info["video.height"] == 64
@@ -463,7 +468,7 @@ class TestEncodeVideoFrames:
        assert info["video.codec"] == "av1"
        assert info["video.pix_fmt"] == "yuv420p"
        assert info["video.fps"] == 30
-        assert info["video.is_depth_map"] is False
+        assert info["is_depth_map"] is False
        assert info["has_audio"] is False
        # Encoder config
        assert info["video.g"] == 4
@@ -39,12 +39,23 @@ DUMMY_VIDEO_INFO = {
    "video.crf": 30,
    "video.preset": 12,
    "video.fast_decode": 0,
-    "video.is_depth_map": False,
+    "is_depth_map": False,
    "has_audio": False,
 }
 DUMMY_CAMERA_FEATURES = {
    "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO},
    "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO},
 }
+DUMMY_DEPTH_VIDEO_INFO = {
+    **DUMMY_VIDEO_INFO,
+    "is_depth_map": True,
+}
+DUMMY_DEPTH_CAMERA_FEATURES = {
+    "laptop_depth": {
+        "shape": (64, 96, 1),
+        "names": ["height", "width", "channels"],
+        "info": DUMMY_DEPTH_VIDEO_INFO,
+    },
+}
 DUMMY_CHW = (3, 96, 128)
 DUMMY_HWC = (96, 128, 3)
Author	SHA1	Message	Date
Caroline Pascal	3ab08a5318	fix(imports): fixing av import in test_depth.py	2026-05-22 15:13:15 +02:00
CarolinePascal	5e53e6bd2f	tests(typos): fixing typos in tests	2026-05-22 13:09:56 +02:00
CarolinePascal	a94d9f119c	fix(info): fixing info metadata update when is_depth_map was set	2026-05-22 02:48:30 +02:00
CarolinePascal	8a615070e7	fix(pre-commit): fixing mutable defautl value	2026-05-22 02:07:33 +02:00
CarolinePascal	8e56797287	feat(refactor): refactor DepthEncoderConfig quantization pipeline, so that the methods do not live in the config class. Add pixel format - channels validation.Move the default pixel format for depth in the config file.	2026-05-22 02:06:37 +02:00
CarolinePascal	7498f1cf61	feat(pix_fmt channels): use PyAv to check get pixel formats number of channels	2026-05-22 02:03:23 +02:00
CarolinePascal	72a429764a	tests(depth): adding new tests for depth integration validation	2026-05-21 20:20:40 +02:00
CarolinePascal	4ea8653ca3	test(fix): fixing exisiting tests to still work with latest features	2026-05-21 19:56:00 +02:00
CarolinePascal	eeabb4d258	chore(typos): fixing typos	2026-05-21 19:55:33 +02:00
CarolinePascal	2b8d7b3c06	fix(plumbing): fixing missing parts in the depth maps pipeline	2026-05-21 16:11:01 +02:00
CarolinePascal	4a49f4a391	fix(stop_event): fixing stop_event race condition in camera classes	2026-05-21 15:51:12 +02:00
CarolinePascal	15647f50a2	feat(is_depth): simplifying is_depth nested name + legacy support	2026-05-21 14:26:16 +02:00
CarolinePascal	e87933302d	feat(depth shape): ensuring depth maps shape is always including the channel	2026-05-21 14:25:42 +02:00
CarolinePascal	3cf5e3c8cb	chore(format): format code	2026-05-20 16:47:22 +02:00
CarolinePascal	33a3b5a982	feat(depth maps writer): adding support for raw depth maps recording with image writer	2026-05-20 16:42:16 +02:00
CarolinePascal	1dafb4acf6	feat(viz): render depth observations as rr.DepthImage in Viridis	2026-05-20 16:22:34 +02:00
CarolinePascal	14df709201	feat(record): plumb DepthEncoderConfig through lerobot-record	2026-05-20 16:14:14 +02:00
CarolinePascal	d6f97ae17f	feat(robots/so_follower): emit + populate depth keys when use_depth	2026-05-20 16:09:53 +02:00
CarolinePascal	085f574301	feat(features): route 2D camera shapes to observation.depth.<key>	2026-05-20 15:50:46 +02:00
CarolinePascal	f15348e769	feat(cameras/realsense): expose async depth in metric meters	2026-05-20 15:24:47 +02:00
CarolinePascal	e51d45dd2c	feat(depth): wire DatasetReader to decode_depth_frames	2026-05-19 23:46:28 +02:00
CarolinePascal	d39698da0f	feat(depth): wire StreamingVideoEncoder + writer to depth encoder	2026-05-19 23:23:27 +02:00
CarolinePascal	b4c31f0f67	feat(depth): plumb DepthEncoderConfig through LeRobotDataset and DatasetWriter	2026-05-19 22:50:19 +02:00
CarolinePascal	0cc5162078	feat(depth): extend quantization tools to better fit the encoding/decoding pipeline	2026-05-19 17:10:47 +02:00
CarolinePascal	b960524d93	feat(depth): persist depth metadata	2026-05-19 16:13:14 +02:00
CarolinePascal	088352383d	feat(video): add ffv1 to supported codecs	2026-05-19 16:13:01 +02:00
CarolinePascal	42214d1c7a	feat(depth): add depth quantization helpers and tests	2026-05-18 18:09:37 +02:00