From 15647f50a2b6b07093f74a6c944c40ed4b7df601 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Thu, 21 May 2026 14:26:16 +0200
Subject: [PATCH] feat(is_depth): simplifying is_depth nested name + legacy
 support

---
 docs/source/video_encoding_parameters.mdx |  4 +--
 src/lerobot/datasets/dataset_metadata.py  | 20 +++++++++-----
 src/lerobot/datasets/dataset_writer.py    |  2 +-
 src/lerobot/datasets/video_utils.py       |  3 +--
 src/lerobot/utils/feature_utils.py        | 32 +++++++++--------------
 tests/datasets/test_dataset_metadata.py   |  8 +++---
 tests/datasets/test_video_encoding.py     |  4 +--
 tests/fixtures/constants.py               |  2 +-
 8 files changed, 37 insertions(+), 38 deletions(-)
diff --git a/docs/source/video_encoding_parameters.mdx b/docs/source/video_encoding_parameters.mdx
index 0b5b99b2b..9665a6b91 100644
--- a/docs/source/video_encoding_parameters.mdx
+++ b/docs/source/video_encoding_parameters.mdx
@@ -82,7 +82,7 @@ After the first episode of a video stream is encoded, the encoder configuration
         "video.pix_fmt": "yuv420p",
         "video.fps": 30,
         "video.channels": 3,
-        "video.is_depth_map": false,
+        "is_depth_map": false,
         "video.g": 2,
         "video.crf": 30,
         "video.preset": "fast",
@@ -97,7 +97,7 @@ After the first episode of a video stream is encoded, the encoder configuration
 
 Two sources contribute to the `info` block:
 
-- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present.
+- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, plus `audio.*` if an audio stream is present. `is_depth_map` is stored alongside these fields; it is `true` only when the encoder configuration is a `DepthEncoderConfig`.
 - **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`.
 
 <Tip>
diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py
index feaa6d543..a5ff9718d 100644
--- a/src/lerobot/datasets/dataset_metadata.py
+++ b/src/lerobot/datasets/dataset_metadata.py
@@ -342,14 +342,20 @@ class LeRobotDatasetMetadata:
     def depth_keys(self) -> list[str]:
         """Keys to access depth-map modalities stored as videos or images.
 
-        A depth key is a feature whose ``info`` dict carries ``"<dtype>.is_depth_map": True``.
+        A depth key is a feature whose ``info`` dict carries ``"is_depth_map": True``
+        (or the legacy ``"video.is_depth_map"`` inside ``info`` or ``video_info``).
         """
-        return [
-            key
-            for key, ft in self.features.items()
-            # TODO(CarolinePascal): Make sure the legacy video_info works here as well.
-            if (ft.get("info") or {}).get(ft["dtype"] + ".is_depth_map", False)
-        ]
+
+        def _is_depth(ft: dict) -> bool:
+            info = ft.get("info") or {}
+            video_info = ft.get("video_info") or {}
+            return (
+                info.get("is_depth_map", False)
+                or info.get("video.is_depth_map", False)
+                or video_info.get("video.is_depth_map", False)
+            )
+
+        return [key for key, ft in self.features.items() if _is_depth(ft)]
 
     @property
     def camera_keys(self) -> list[str]:
diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py
index 5a497f30f..a79d0b57c 100644
--- a/src/lerobot/datasets/dataset_writer.py
+++ b/src/lerobot/datasets/dataset_writer.py
@@ -155,7 +155,7 @@ class DatasetWriter:
         return ep_buffer
 
     def _get_image_file_path(self, episode_index: int, image_key: str, frame_index: int) -> Path:
-        path_template = DEFAULT_DEPTH_PATH if self.image_key in self._meta.depth_keys else DEFAULT_IMAGE_PATH
+        path_template = DEFAULT_DEPTH_PATH if image_key in self._meta.depth_keys else DEFAULT_IMAGE_PATH
         fpath = path_template.format(
             image_key=image_key, episode_index=episode_index, frame_index=frame_index
         )
diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py
index 1a1233cbc..3951e69d0 100644
--- a/src/lerobot/datasets/video_utils.py
+++ b/src/lerobot/datasets/video_utils.py
@@ -955,7 +955,7 @@ def get_video_info(
         video_info["video.width"] = video_stream.width
         video_info["video.codec"] = video_stream.codec.canonical_name
         video_info["video.pix_fmt"] = video_stream.pix_fmt
-        video_info["video.is_depth_map"] = False
+        video_info["is_depth_map"] = False
 
         # Calculate fps from r_frame_rate
         video_info["video.fps"] = int(video_stream.base_rate)
@@ -976,7 +976,7 @@ def get_video_info(
             if field_name == "vcodec":
                 continue
             video_info.setdefault(f"video.{field_name}", field_value)
-        video_info["video.is_depth_map"] = isinstance(video_encoder, DepthEncoderConfig)
+        video_info["is_depth_map"] = isinstance(video_encoder, DepthEncoderConfig)
 
     return video_info
 
diff --git a/src/lerobot/utils/feature_utils.py b/src/lerobot/utils/feature_utils.py
index 85a43f142..44f3877e4 100644
--- a/src/lerobot/utils/feature_utils.py
+++ b/src/lerobot/utils/feature_utils.py
@@ -88,21 +88,18 @@ def hw_to_dataset_features(
 
     for key, shape in cam_fts.items():
         dtype = "video" if use_video else "image"
-        if len(shape) == 2 or shape[2] == 1:
-            if len(shape) == 2:
-                shape = (shape[0], shape[1], 1)
-            features[f"{prefix}.depth_maps.{key}"] = {
-                "dtype": dtype,
-                "shape": shape,
-                "names": ["height", "width", "channels"],
-                "info": {dtype + ".is_depth_map": True},
-            }
-        else:
+        if len(shape) == 3 and shape[2] in (1, 3):
             features[f"{prefix}.images.{key}"] = {
                 "dtype": dtype,
                 "shape": shape,
                 "names": ["height", "width", "channels"],
+                "info": {"is_depth_map": shape[2] == 1},
             }
+        else:
+            raise ValueError(
+                f"Camera feature '{key}' has unsupported shape {shape}. "
+                "Expected (H, W, C) with C=3 (RGB) or C=1 (depth), e.g. (480, 640, 3) or (480, 640, 1)."
+            )
 
     _validate_feature_names(features)
     return features
@@ -132,10 +129,7 @@ def build_dataset_frame(
         elif ft["dtype"] == "float32" and len(ft["shape"]) == 1:
             frame[key] = np.array([values[name] for name in ft["names"]], dtype=np.float32)
         elif ft["dtype"] in ["image", "video"]:
-            if ft["info"].get(ft["dtype"] + ".is_depth_map"):
-                frame[key] = values[key.removeprefix(f"{prefix}.depth_maps.")]
-            else:
-                frame[key] = values[key.removeprefix(f"{prefix}.images.")]
+            frame[key] = values[key.removeprefix(f"{prefix}.images.")]
 
     return frame
 
@@ -164,11 +158,11 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea
             type = FeatureType.VISUAL
             if len(shape) != 3:
                 raise ValueError(f"Number of dimensions of {key} != 3 (shape={shape})")
-
-            names = ft["names"]
-            # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets.
-            if names[2] in ["channel", "channels"]:  # (h, w, c) -> (c, h, w)
-                shape = (shape[2], shape[0], shape[1])
+            else:
+                names = ft["names"]
+                # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets.
+                if names[2] in ["channel", "channels"]:  # (h, w, c) -> (c, h, w)
+                    shape = (shape[2], shape[0], shape[1])
         elif key == OBS_ENV_STATE:
             type = FeatureType.ENV
         elif key.startswith(OBS_STR):
diff --git a/tests/datasets/test_dataset_metadata.py b/tests/datasets/test_dataset_metadata.py
index 6d8b6f06f..2250746ef 100644
--- a/tests/datasets/test_dataset_metadata.py
+++ b/tests/datasets/test_dataset_metadata.py
@@ -59,7 +59,7 @@ DEPTH_FEATURES = {
         "dtype": "video",
         "shape": (64, 96, 1),
         "names": ["height", "width", "channels"],
-        "info": {"video.is_depth_map": True},
+        "info": {"is_depth_map": True},
     },
 }
 
@@ -155,7 +155,7 @@ def test_create_without_videos_has_no_video_path(tmp_path):
 
 
 def test_depth_keys_property_filters_by_marker(tmp_path):
-    """``depth_keys`` selects only video features carrying ``video.is_depth_map=True``."""
+    """``depth_keys`` selects only features carrying ``is_depth_map=True`` in info."""
     features = {
         **VIDEO_FEATURES,
         **DEPTH_FEATURES,
@@ -164,8 +164,8 @@ def test_depth_keys_property_filters_by_marker(tmp_path):
         repo_id="test/depth_keys", fps=DEFAULT_FPS, features=features, root=tmp_path / "depth_keys"
     )
 
-    assert set(meta.video_keys) == {"observation.images.laptop", "observation.depth.laptop"}
-    assert meta.depth_keys == ["observation.depth.laptop"]
+    assert set(meta.video_keys) == {"observation.images.laptop", "observation.images.laptop_depth"}
+    assert meta.depth_keys == ["observation.images.laptop_depth"]
 
 
 def test_depth_keys_empty_when_no_marker(tmp_path):
diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py
index 7e33aa63f..f21a1cdf7 100644
--- a/tests/datasets/test_video_encoding.py
+++ b/tests/datasets/test_video_encoding.py
@@ -368,7 +368,7 @@ class TestGetVideoInfo:
         assert info["video.pix_fmt"] == "yuv420p"
         assert info["video.fps"] == 30
         assert info["video.channels"] == 3
-        assert info["video.is_depth_map"] is False
+        assert info["is_depth_map"] is False
         assert info["has_audio"] is False
         assert "video.g" not in info
         assert "video.crf" not in info
@@ -463,7 +463,7 @@ class TestEncodeVideoFrames:
         assert info["video.codec"] == "av1"
         assert info["video.pix_fmt"] == "yuv420p"
         assert info["video.fps"] == 30
-        assert info["video.is_depth_map"] is False
+        assert info["is_depth_map"] is False
         assert info["has_audio"] is False
         # Encoder config
         assert info["video.g"] == 4
diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py
index 4d578b503..23aca5b0b 100644
--- a/tests/fixtures/constants.py
+++ b/tests/fixtures/constants.py
@@ -39,7 +39,7 @@ DUMMY_VIDEO_INFO = {
     "video.crf": 30,
     "video.preset": 12,
     "video.fast_decode": 0,
-    "video.is_depth_map": False,
+    "is_depth_map": False,
     "has_audio": False,
 }
 DUMMY_CAMERA_FEATURES = {