From eba1d1bd0c3d8413b304832ec7aeafce0de6e710 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Tue, 30 Jun 2026 13:50:57 +0200 Subject: [PATCH] feat(depth stats): enforcing all depth stats to be in millimeters (default unit) for consistency --- src/lerobot/datasets/compute_stats.py | 16 +++++++++++----- src/lerobot/datasets/depth_utils.py | 12 ++++++------ src/lerobot/datasets/video_utils.py | 5 ++++- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py index 88f7ea226..524a86b93 100644 --- a/src/lerobot/datasets/compute_stats.py +++ b/src/lerobot/datasets/compute_stats.py @@ -22,6 +22,7 @@ import numpy as np from lerobot.processor import RelativeActionsProcessorStep from lerobot.utils.constants import ACTION, OBS_STATE +from .depth_utils import MM_PER_METRE from .io_utils import load_image_as_numpy DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99] @@ -508,8 +509,8 @@ def compute_episode_stats( Note: For 'image'/'video' features, stats are computed per channel and kept with a leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by - 255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip - this rescaling and remain in their stored units. + 255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) are + instead canonicalized to millimetres regardless of the raw frame unit. """ if quantile_list is None: quantile_list = DEFAULT_QUANTILES @@ -533,9 +534,14 @@ def compute_episode_stats( ) if features[key]["dtype"] in ["image", "video"]: - normalization_factor = ( - 255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0 - ) + if (features[key].get("info") or {}).get("is_depth_map", False): + # Depth stats are canonically stored in millimetres; metre (float) depth is + # scaled up, integer (millimetre) depth is left as-is. + normalization_factor = ( + 1.0 / MM_PER_METRE if np.issubdtype(ep_ft_array.dtype, np.floating) else 1.0 + ) + else: + normalization_factor = 255.0 ep_stats[key] = { k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0) for k, v in ep_stats[key].items() diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py index 801c86a09..5251b4e0b 100644 --- a/src/lerobot/datasets/depth_utils.py +++ b/src/lerobot/datasets/depth_utils.py @@ -39,7 +39,7 @@ from lerobot.configs.video import ( from .image_writer import squeeze_single_channel from .pyav_utils import write_u16_plane -_MM_PER_METRE = 1000.0 +MM_PER_METRE = 1000.0 _UINT16_MAX = 65535 @@ -126,12 +126,12 @@ def quantize_depth( # Convert depth_min, depth_max, and shift to the resolved input unit. depth_min_u = ( - np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE) + np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE) ) depth_max_u = ( - np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE) + np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE) ) - shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE) + shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE) # Normalization and quantization is performed in the resolved input unit. if use_log: @@ -236,7 +236,7 @@ def dequantize_depth( # mm path: round + clamp in float32, skipping the uint16 round-trip # when returning a tensor (torch.uint16 is poorly supported). - buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) + buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) if output_tensor: return buf return buf.cpu().numpy().astype(np.uint16, copy=False) @@ -259,7 +259,7 @@ def dequantize_depth( if output_unit == DEPTH_METER_UNIT: return torch.from_numpy(buf) if output_tensor else buf - np.multiply(buf, _MM_PER_METRE, out=buf) + np.multiply(buf, MM_PER_METRE, out=buf) np.rint(buf, out=buf) np.clip(buf, 0.0, _UINT16_MAX, out=buf) if output_tensor: diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index ef3005dd8..6f8c54f4a 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -47,7 +47,7 @@ from lerobot.configs import ( ) from lerobot.utils.import_utils import get_safe_default_video_backend -from .depth_utils import quantize_depth +from .depth_utils import MM_PER_METRE, quantize_depth from .pyav_utils import get_pix_fmt_channels logger = logging.getLogger(__name__) @@ -848,6 +848,9 @@ class _CameraEncoderThread(threading.Thread): # Reshape CHW to (H*W, C) for per-channel stats channels = img_downsampled.shape[0] img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels) + # Depth stats are canonically stored in millimetres; metre (float) depth is scaled up. + if self.is_depth and np.issubdtype(frame_data.dtype, np.floating): + img_for_stats = img_for_stats * MM_PER_METRE stats_tracker.update(img_for_stats) frame_count += 1