From 006ca66a667ac65c7706c9b6694fbb786cd9419a Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Tue, 30 Jun 2026 22:05:21 +0200 Subject: [PATCH] fix(depth unit): storing raw depth units in the dataset metadata for correct depth statistics and depth raw frames handling. The unit is stored as a string ("m","mm") under "depth_unit" at the same level as "is_depth_map". Unit is inferred from the depth frame type. --- src/lerobot/configs/__init__.py | 4 ++++ src/lerobot/datasets/compute_stats.py | 2 +- src/lerobot/datasets/dataset_writer.py | 10 ++++++++++ src/lerobot/datasets/depth_utils.py | 26 +++++++++++++++----------- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py index 168b367db..20f81fb18 100644 --- a/src/lerobot/configs/__init__.py +++ b/src/lerobot/configs/__init__.py @@ -34,6 +34,8 @@ from .types import ( ) from .video import ( DEFAULT_DEPTH_UNIT, + DEPTH_METER_UNIT, + DEPTH_MILLIMETER_UNIT, VALID_VIDEO_CODECS, VIDEO_ENCODER_INFO_KEYS, DepthEncoderConfig, @@ -72,6 +74,8 @@ __all__ = [ "encoder_config_from_video_info", # Constants "DEFAULT_DEPTH_UNIT", + "DEPTH_METER_UNIT", + "DEPTH_MILLIMETER_UNIT", "VALID_VIDEO_CODECS", "VIDEO_ENCODER_INFO_KEYS", ] diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py index 88f7ea226..02ecd81a4 100644 --- a/src/lerobot/datasets/compute_stats.py +++ b/src/lerobot/datasets/compute_stats.py @@ -509,7 +509,7 @@ def compute_episode_stats( For 'image'/'video' features, stats are computed per channel and kept with a leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by 255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip - this rescaling and remain in their stored units. + this rescaling and remain in their stored units (stored in ``depth_unit``). """ if quantile_list is None: quantile_list = DEFAULT_QUANTILES diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py index 1aee1497c..f8bf0eddb 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -41,6 +41,7 @@ from lerobot.configs import ( from .compute_stats import compute_episode_stats from .dataset_metadata import LeRobotDatasetMetadata +from .depth_utils import infer_depth_unit from .feature_utils import ( get_hf_features_from_features, validate_episode_buffer, @@ -209,6 +210,15 @@ class DatasetWriter: self.episode_buffer["timestamp"].append(timestamp) self.episode_buffer["task"].append(frame.pop("task")) + # Record each depth feature's input unit once, inferred from the first frame's dtype. + if frame_index == 0: + for depth_key in self._meta.depth_keys: + if depth_key not in frame: + continue + info = self._meta.features[depth_key].setdefault("info", {}) + if info.get("depth_unit") is None: + info["depth_unit"] = infer_depth_unit(np.asarray(frame[depth_key]).dtype) + # Start streaming encoder on first frame of episode if frame_index == 0 and self._streaming_encoder is not None: self._streaming_encoder.start_episode( diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py index 801c86a09..04aa9a54b 100644 --- a/src/lerobot/datasets/depth_utils.py +++ b/src/lerobot/datasets/depth_utils.py @@ -39,10 +39,18 @@ from lerobot.configs.video import ( from .image_writer import squeeze_single_channel from .pyav_utils import write_u16_plane -_MM_PER_METRE = 1000.0 +MM_PER_METRE = 1000.0 _UINT16_MAX = 65535 +def infer_depth_unit(dtype: np.dtype | type) -> str: + """Infer the physical unit of raw depth frames from their dtype. + + Floating-point frames are assumed to be in metres, integer frames in millimetres. + """ + return DEPTH_METER_UNIT if np.issubdtype(np.dtype(dtype), np.floating) else DEPTH_MILLIMETER_UNIT + + def _validate_log_quant_params(depth_min: float, shift: float) -> None: """Ensure ``log(depth_min + shift)`` is finite.""" if depth_min + shift <= 0: @@ -57,11 +65,7 @@ def _depth_input_to_float32_and_unit( input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT], ) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]: """Convert depth to float32 in the chosen unit, and return the resolved unit.""" - resolved_unit = ( - (DEPTH_METER_UNIT if np.issubdtype(depth.dtype, np.floating) else DEPTH_MILLIMETER_UNIT) - if input_unit == "auto" - else input_unit - ) + resolved_unit = infer_depth_unit(depth.dtype) if input_unit == "auto" else input_unit return depth.astype(np.float32, order="K"), resolved_unit @@ -126,12 +130,12 @@ def quantize_depth( # Convert depth_min, depth_max, and shift to the resolved input unit. depth_min_u = ( - np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE) + np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE) ) depth_max_u = ( - np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE) + np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE) ) - shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE) + shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE) # Normalization and quantization is performed in the resolved input unit. if use_log: @@ -236,7 +240,7 @@ def dequantize_depth( # mm path: round + clamp in float32, skipping the uint16 round-trip # when returning a tensor (torch.uint16 is poorly supported). - buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) + buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) if output_tensor: return buf return buf.cpu().numpy().astype(np.uint16, copy=False) @@ -259,7 +263,7 @@ def dequantize_depth( if output_unit == DEPTH_METER_UNIT: return torch.from_numpy(buf) if output_tensor else buf - np.multiply(buf, _MM_PER_METRE, out=buf) + np.multiply(buf, MM_PER_METRE, out=buf) np.rint(buf, out=buf) np.clip(buf, 0.0, _UINT16_MAX, out=buf) if output_tensor: