From 006ca66a667ac65c7706c9b6694fbb786cd9419a Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Tue, 30 Jun 2026 22:05:21 +0200
Subject: [PATCH] fix(depth unit): store raw depth units in the dataset
 metadata for correct depth statistics and raw depth frame handling. The unit
 is stored as a string ("m" or "mm") under "depth_unit" at the same level as
 "is_depth_map". The unit is inferred from the depth frame dtype.

---
 src/lerobot/configs/__init__.py        |  4 ++++
 src/lerobot/datasets/compute_stats.py  |  2 +-
 src/lerobot/datasets/dataset_writer.py | 10 ++++++++++
 src/lerobot/datasets/depth_utils.py    | 26 +++++++++++++++-----------
 4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py
index 168b367db..20f81fb18 100644
--- a/src/lerobot/configs/__init__.py
+++ b/src/lerobot/configs/__init__.py
@@ -34,6 +34,8 @@ from .types import (
 )
 from .video import (
     DEFAULT_DEPTH_UNIT,
+    DEPTH_METER_UNIT,
+    DEPTH_MILLIMETER_UNIT,
     VALID_VIDEO_CODECS,
     VIDEO_ENCODER_INFO_KEYS,
     DepthEncoderConfig,
@@ -72,6 +74,8 @@ __all__ = [
     "encoder_config_from_video_info",
     # Constants
     "DEFAULT_DEPTH_UNIT",
+    "DEPTH_METER_UNIT",
+    "DEPTH_MILLIMETER_UNIT",
     "VALID_VIDEO_CODECS",
     "VIDEO_ENCODER_INFO_KEYS",
 ]
diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py
index 88f7ea226..02ecd81a4 100644
--- a/src/lerobot/datasets/compute_stats.py
+++ b/src/lerobot/datasets/compute_stats.py
@@ -509,7 +509,7 @@ def compute_episode_stats(
         For 'image'/'video' features, stats are computed per channel and kept with a
         leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
         255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
-        this rescaling and remain in their stored units.
+        this rescaling and remain in their stored units (stored in ``depth_unit``).
     """
     if quantile_list is None:
         quantile_list = DEFAULT_QUANTILES
diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py
index 1aee1497c..f8bf0eddb 100644
--- a/src/lerobot/datasets/dataset_writer.py
+++ b/src/lerobot/datasets/dataset_writer.py
@@ -41,6 +41,7 @@ from lerobot.configs import (
 
 from .compute_stats import compute_episode_stats
 from .dataset_metadata import LeRobotDatasetMetadata
+from .depth_utils import infer_depth_unit
 from .feature_utils import (
     get_hf_features_from_features,
     validate_episode_buffer,
@@ -209,6 +210,15 @@ class DatasetWriter:
         self.episode_buffer["timestamp"].append(timestamp)
         self.episode_buffer["task"].append(frame.pop("task"))
 
+        # Record each depth feature's input unit once, inferred from the first frame's dtype.
+        if frame_index == 0:
+            for depth_key in self._meta.depth_keys:
+                if depth_key not in frame:
+                    continue
+                info = self._meta.features[depth_key].setdefault("info", {})
+                if info.get("depth_unit") is None:
+                    info["depth_unit"] = infer_depth_unit(np.asarray(frame[depth_key]).dtype)
+
         # Start streaming encoder on first frame of episode
         if frame_index == 0 and self._streaming_encoder is not None:
             self._streaming_encoder.start_episode(
diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py
index 801c86a09..04aa9a54b 100644
--- a/src/lerobot/datasets/depth_utils.py
+++ b/src/lerobot/datasets/depth_utils.py
@@ -39,10 +39,18 @@ from lerobot.configs.video import (
 from .image_writer import squeeze_single_channel
 from .pyav_utils import write_u16_plane
 
-_MM_PER_METRE = 1000.0
+MM_PER_METRE = 1000.0
 _UINT16_MAX = 65535
 
 
+def infer_depth_unit(dtype: np.dtype | type) -> str:
+    """Infer the physical unit of raw depth frames from their dtype.
+
+    Floating-point dtypes are assumed to be metres; every other dtype (not only integers) millimetres.
+    """
+    return DEPTH_METER_UNIT if np.issubdtype(np.dtype(dtype), np.floating) else DEPTH_MILLIMETER_UNIT
+
+
 def _validate_log_quant_params(depth_min: float, shift: float) -> None:
     """Ensure ``log(depth_min + shift)`` is finite."""
     if depth_min + shift <= 0:
@@ -57,11 +65,7 @@ def _depth_input_to_float32_and_unit(
     input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT],
 ) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]:
     """Convert depth to float32 in the chosen unit, and return the resolved unit."""
-    resolved_unit = (
-        (DEPTH_METER_UNIT if np.issubdtype(depth.dtype, np.floating) else DEPTH_MILLIMETER_UNIT)
-        if input_unit == "auto"
-        else input_unit
-    )
+    resolved_unit = infer_depth_unit(depth.dtype) if input_unit == "auto" else input_unit
     return depth.astype(np.float32, order="K"), resolved_unit
 
 
@@ -126,12 +130,12 @@ def quantize_depth(
 
     # Convert depth_min, depth_max, and shift to the resolved input unit.
     depth_min_u = (
-        np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
+        np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
     )
     depth_max_u = (
-        np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
+        np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
     )
-    shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
+    shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
 
     # Normalization and quantization is performed in the resolved input unit.
     if use_log:
@@ -236,7 +240,7 @@ def dequantize_depth(
 
         # mm path: round + clamp in float32, skipping the uint16 round-trip
         # when returning a tensor (torch.uint16 is poorly supported).
-        buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
+        buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
         if output_tensor:
             return buf
         return buf.cpu().numpy().astype(np.uint16, copy=False)
@@ -259,7 +263,7 @@ def dequantize_depth(
     if output_unit == DEPTH_METER_UNIT:
         return torch.from_numpy(buf) if output_tensor else buf
 
-    np.multiply(buf, _MM_PER_METRE, out=buf)
+    np.multiply(buf, MM_PER_METRE, out=buf)
     np.rint(buf, out=buf)
     np.clip(buf, 0.0, _UINT16_MAX, out=buf)
     if output_tensor: