docs(depth stats): updating docs

test(depth stats): updating tests
feat(depth stats): enforcing all depth stats to be in millimeters (default unit) for consistency
2026-07-02 15:47:05 +00:00 · 2026-07-01 18:14:01 +02:00 · 2026-07-01 18:14:01 +02:00 · 2026-07-01 18:14:01 +02:00 · 2026-07-01 17:05:43 +02:00
7 changed files with 86 additions and 19 deletions
@@ -134,6 +134,9 @@ lerobot-train \
 > [!TIP]
 > This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras.

+> [!IMPORTANT]
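+> Depth statistics in `meta/stats.json` are always stored in **millimetres**, regardless of the raw frame dtype: floating-point frames are interpreted as metres and scaled by 1000, while integer frames are treated as millimetres and left unchanged.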
+
 ---

 ## Persistence in dataset metadata
@@ -22,6 +22,7 @@ import numpy as np
 from lerobot.processor import RelativeActionsProcessorStep
 from lerobot.utils.constants import ACTION, OBS_STATE

+from .depth_utils import MM_PER_METRE
 from .io_utils import load_image_as_numpy

 DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99]
@@ -508,8 +509,8 @@ def compute_episode_stats(
    Note:
        For 'image'/'video' features, stats are computed per channel and kept with a
        leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
-        255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
-        this rescaling and remain in their stored units.
+        255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) are
+        instead canonicalized to millimetres regardless of the raw frame unit.
    """
    if quantile_list is None:
        quantile_list = DEFAULT_QUANTILES
@@ -533,9 +534,14 @@ def compute_episode_stats(
        )

        if features[key]["dtype"] in ["image", "video"]:
-            normalization_factor = (
-                255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0
-            )
+            if (features[key].get("info") or {}).get("is_depth_map", False):
+                # Depth stats are canonically stored in millimetres. The factor below is a divisor, so
+                # 1 / MM_PER_METRE scales metre (float) depth up by 1000; integer (mm) depth is unchanged.
+                normalization_factor = (
+                    1.0 / MM_PER_METRE if np.issubdtype(ep_ft_array.dtype, np.floating) else 1.0
+                )
+            else:
+                normalization_factor = 255.0
            ep_stats[key] = {
                k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0)
                for k, v in ep_stats[key].items()
@@ -39,7 +39,7 @@ from lerobot.configs.video import (
 from .image_writer import squeeze_single_channel
 from .pyav_utils import write_u16_plane

-_MM_PER_METRE = 1000.0
+MM_PER_METRE = 1000.0
 _UINT16_MAX = 65535


@@ -126,12 +126,12 @@ def quantize_depth(

    # Convert depth_min, depth_max, and shift to the resolved input unit.
    depth_min_u = (
-        np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
+        np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
    )
    depth_max_u = (
-        np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
+        np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
    )
-    shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
+    shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)

    # Normalization and quantization is performed in the resolved input unit.
    if use_log:
@@ -236,7 +236,7 @@ def dequantize_depth(

        # mm path: round + clamp in float32, skipping the uint16 round-trip
        # when returning a tensor (torch.uint16 is poorly supported).
-        buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
+        buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
        if output_tensor:
            return buf
        return buf.cpu().numpy().astype(np.uint16, copy=False)
@@ -259,7 +259,7 @@ def dequantize_depth(
    if output_unit == DEPTH_METER_UNIT:
        return torch.from_numpy(buf) if output_tensor else buf

-    np.multiply(buf, _MM_PER_METRE, out=buf)
+    np.multiply(buf, MM_PER_METRE, out=buf)
    np.rint(buf, out=buf)
    np.clip(buf, 0.0, _UINT16_MAX, out=buf)
    if output_tensor:
@@ -47,7 +47,7 @@ from lerobot.configs import (
 )
 from lerobot.utils.import_utils import get_safe_default_video_backend

-from .depth_utils import quantize_depth
+from .depth_utils import MM_PER_METRE, quantize_depth
 from .pyav_utils import get_pix_fmt_channels

 logger = logging.getLogger(__name__)
@@ -848,6 +848,9 @@ class _CameraEncoderThread(threading.Thread):
                # Reshape CHW to (H*W, C) for per-channel stats
                channels = img_downsampled.shape[0]
                img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
+                # Depth stats are canonically stored in millimetres; metre (float) depth is scaled up.
+                if self.is_depth and np.issubdtype(frame_data.dtype, np.floating):
+                    img_for_stats = img_for_stats * MM_PER_METRE
                stats_tracker.update(img_for_stats)

                frame_count += 1
@@ -14,14 +14,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import functools
 import traceback

+import draccus.wrappers.docstring as _draccus_docstring
 import pytest

 from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
 from lerobot.utils.import_utils import is_package_available
 from tests.utils import DEVICE

+# On every `draccus.parse()`, draccus rebuilds each dataclass field's help text by
+# re-reading and re-parsing the class source (draccus.wrappers.docstring). For a config
+# as large as TrainPipelineConfig this costs ~2.5s per parse — negligible for the single
+# parse a CLI does, but tests parse configs hundreds of times. The source can't change
+# within a run, so memoize it for the whole test session.
+_draccus_docstring.get_attribute_docstring = functools.cache(_draccus_docstring.get_attribute_docstring)
+
 # Import fixture modules as plugins.
 # Fixtures that depend on optional packages are only registered when those packages are available,
 # so that tests can be collected and run even with a minimal install.
@@ -245,3 +245,44 @@ class TestFeatureFileRouting:

        dataset.save_episode()
        dataset.finalize()
+
+
+# ── 5. Depth stats unit canonicalization (millimetres) ────────────────
+
+
+class TestDepthStatsUnit:
+    """Depth stats are always stored in millimetres, regardless of raw frame dtype."""
+
+    NUM_FRAMES = 4
+
+    @pytest.mark.parametrize("use_videos", [False, True])
+    def test_stats_canonicalized_to_mm(self, tmp_path, features_factory, use_videos):
+        """Float (metre) and integer (millimetre) depth over the same physical range
+        yield identical millimetre-scale stats."""
+        from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+        def _record(depth_dtype, root):
+            features = features_factory(
+                camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos
+            )
+            dataset = LeRobotDataset.create(
+                repo_id=DUMMY_REPO_ID,
+                fps=DEFAULT_FPS,
+                features=features,
+                root=root,
+                use_videos=use_videos,
+                streaming_encoding=use_videos,
+            )
+            add_frames(dataset, num_frames=self.NUM_FRAMES, depth_dtype=depth_dtype)
+            dataset.save_episode()
+            dataset.finalize()
+            return np.asarray(dataset.meta.stats[DEPTH_KEY]["mean"]).reshape(-1)
+
+        # add_frames ramps float depth over 0.1–10 m and integer depth over 100–10000 mm
+        # (the same physical range), so canonicalized stats must match.
+        mean_m = _record(np.float32, tmp_path / "ds_m")
+        mean_mm = _record(np.uint16, tmp_path / "ds_mm")
+
+        # Float (metre) input is scaled to millimetres, not left in the single-digit metre range.
+        assert mean_m.item() > 50.0
+        np.testing.assert_allclose(mean_m, mean_mm, rtol=0.05)
@@ -49,16 +49,18 @@ from tests.fixtures.constants import (
 )


-def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
+def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype = np.uint16) -> None:
    """Append ``num_frames`` synthetic frames to ``dataset``.

-    Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for
-    keys in ``dataset.meta.depth_keys``, uint8 random noise for video/image keys,
-    and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp,
-    frame_index, ...) are auto-populated by ``add_frame`` and skipped here.
+    Generates per-feature payloads from ``dataset.meta``: depth ramps (``depth_dtype``,
+    default ``uint16`` millimetres; pass ``np.float32`` for metres) for keys in
+    ``dataset.meta.depth_keys``, uint8 random noise for video/image keys, and float32
+    zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, frame_index, ...) are
+    auto-populated by ``add_frame`` and skipped here.
    """
    video_keys = dataset.meta.video_keys
    depth_keys = dataset.meta.depth_keys
+    depth_is_float = np.issubdtype(depth_dtype, np.floating)
    # Smooth gradient base reused per (H, W) to keep depth frames cheap to
    # encode (HEVC Main 12 hates white noise).
    _depth_base_cache: dict[tuple[int, int], np.ndarray] = {}
@@ -70,11 +72,14 @@ def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
            shape = ft["shape"]
            if key in depth_keys:
                h, w, _ = shape
+                # Float depth is expressed in metres, integer depth in millimetres.
+                lo, hi = (0.1, 10.0) if depth_is_float else (100.0, 10_000.0)
                base = _depth_base_cache.setdefault(
                    (h, w),
-                    np.linspace(100.0, 10_000.0, h * w, dtype=np.float32).reshape(h, w, 1),
+                    np.linspace(lo, hi, h * w, dtype=np.float32).reshape(h, w, 1),
                )
-                frame[key] = (base + 50.0 * i).clip(0, 65535).astype(np.uint16)
+                step = (0.05 if depth_is_float else 50.0) * i
+                frame[key] = (base + step).clip(0, 65535).astype(depth_dtype)
            elif key in video_keys:
                frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8)
            else:
Author	SHA1	Message	Date
CarolinePascal	f8728bde84	docs(depth stats): updating docs	2026-07-01 18:14:01 +02:00
CarolinePascal	d3fd459f81	test(depth stats): updating tests	2026-07-01 18:14:01 +02:00
CarolinePascal	ed29db6d22	feat(depth stats): enforcing all depth stats to be in millimeters (default unit) for consistency	2026-07-01 18:14:01 +02:00
Nicolas Rabault	e623733861	perf(tests): cache draccus docstring extraction (#3903 ) draccus re-parses each config class's source on every parse() to extract field help text (~2.5s for TrainPipelineConfig). Memoize it for the test session; the source is constant within a run. Fast Tests test time: 664s -> 404s (-39%).	2026-07-01 17:05:43 +02:00