diff --git a/src/lerobot/datasets/dataset_reader.py b/src/lerobot/datasets/dataset_reader.py index 88267e55a..927cd9f8c 100644 --- a/src/lerobot/datasets/dataset_reader.py +++ b/src/lerobot/datasets/dataset_reader.py @@ -266,7 +266,6 @@ class DatasetReader: depth_max=depth_encoder.depth_max, shift=depth_encoder.shift, use_log=depth_encoder.use_log, - output_tensor=True, ) return vid_key, frames.squeeze(0) diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py index 138df9009..e2aa2db59 100644 --- a/src/lerobot/datasets/depth_utils.py +++ b/src/lerobot/datasets/depth_utils.py @@ -143,72 +143,115 @@ def quantize_depth( def dequantize_depth( - quantized: NDArray[np.uint16] | av.VideoFrame, + quantized: NDArray[np.uint16] | av.VideoFrame | torch.Tensor, depth_min: float = DEFAULT_DEPTH_MIN, depth_max: float = DEFAULT_DEPTH_MAX, shift: float = DEFAULT_DEPTH_SHIFT, use_log: bool = DEFAULT_DEPTH_USE_LOG, pix_fmt: str = DEFAULT_DEPTH_PIX_FMT, output_unit: Literal["m", "mm"] = "mm", - output_tensor: bool = False, + output_tensor: bool = True, + output_channel_last: bool = False, ) -> NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor: """Inverse of :func:`quantize_depth`. - Tuning arguments **must match** :func:`quantize_depth`. - Decoding inverts the same normalized code mapping as :func:`quantize_depth` using ``depth_min`` / ``depth_max`` / ``shift`` (in metres), then returns - the requested output unit. + the requested output unit. Tuning arguments **must match** :func:`quantize_depth`. + + Accepted input layouts : + + - ``(H, W, 1)`` or ``(H, W)`` — single frame with channel-last. + - ``(..., 1, H, W)`` — batched frames with channel-first. + - ``(..., H, W, 1)`` — batched frames with channel-last. + Output layout is determined by ``output_channel_last``. Args: - quantized: 12-bit codes ``[0, DEPTH_QMAX]``, ``dtype=uint16``. + quantized: 12-bit codes in ``[0, DEPTH_QMAX]``. ``np.ndarray``, + ``av.VideoFrame``, or ``torch.Tensor`` (any integer or float dtype). depth_min, depth_max, shift, use_log: Same as :func:`quantize_depth` (metres). - output_unit: ``\"mm\"`` returns ``uint16`` millimetres (``rint``, clip - ``[0, 65535]``). ``\"m\"`` returns ``float32`` metres in + pix_fmt: Pixel format used to extract the plane from an ``av.VideoFrame``. + output_unit: ``"mm"`` returns ``uint16`` millimetres (rint, clip + ``[0, 65535]``) when returning a numpy array, or ``float32`` mm when + ``output_tensor=True``. ``"m"`` returns ``float32`` metres in ``[depth_min, depth_max]``. - output_tensor: If True, return a torch.Tensor instead of a numpy array. + output_tensor: If True, return a ``torch.Tensor`` instead of a numpy array. Returns: Depth map in the requested unit and dtype. Raises: + ValueError: If ``output_unit`` is not ``"m"`` or ``"mm"``. ValueError: If ``use_log=True`` and ``depth_min + shift <= 0``. - ValueError: If ``output_unit`` is not ``\"m\"`` or ``\"mm\"``. """ if output_unit not in ("m", "mm"): raise ValueError(f"output_unit must be 'm' or 'mm', got {output_unit!r}") + if use_log: + _validate_log_quant_params(depth_min, shift) if isinstance(quantized, av.VideoFrame): quantized = quantized.to_ndarray(format=pix_fmt) - norm = np.asarray(quantized, dtype=np.float32, order="K") / DEPTH_QMAX - - depth_min_m = np.float32(depth_min) - depth_max_m = np.float32(depth_max) - shift_m = np.float32(shift) - - # The de-normalization and de-quantization is performed in meters (convenience choice). + # Compute the scale and offset first. + depth_min_m = float(depth_min) + depth_max_m = float(depth_max) + shift_m = float(shift) if use_log: - _validate_log_quant_params(depth_min, shift) - log_min = math.log(float(depth_min_m + shift_m)) - log_max = math.log(float(depth_max_m + shift_m)) - depth_m = np.exp(norm * (log_max - log_min) + log_min) - shift_m + log_min = math.log(depth_min_m + shift_m) + log_max = math.log(depth_max_m + shift_m) + scale = (log_max - log_min) / DEPTH_QMAX + offset = log_min else: - depth_m = norm * (depth_max_m - depth_min_m) + depth_min_m - depth_m = np.clip(depth_m, depth_min_m, depth_max_m).astype(np.float32, copy=False) + scale = (depth_max_m - depth_min_m) / DEPTH_QMAX + offset = depth_min_m - # Add single-channel dim: (H, W) → (H, W, 1) - if depth_m.ndim == 2: - depth_m = depth_m[..., np.newaxis] + # ── Torch path: stay on the input device, single fp32 allocation. ──────── + if isinstance(quantized, torch.Tensor): + + if quantized.ndim >= 3: + # Drop the single-channel dimension so the math runs on (..., H, W). + quantized = quantized.squeeze(-3) if quantized.shape[-3] == 1 else quantized.squeeze(-1) + + # Single allocation we own; everything else is in-place. + buf = quantized.to(dtype=torch.float32, copy=True) + buf.mul_(scale).add_(offset) + if use_log: + buf.exp_().sub_(shift_m) + buf.clamp_(depth_min_m, depth_max_m) + buf.unsqueeze_(-1) if output_channel_last else buf.unsqueeze_(-3) + + if output_unit == "m": + return buf if output_tensor else buf.cpu().numpy() + + # mm path: round + clamp in float32, skipping the uint16 round-trip + # when returning a tensor (torch.uint16 is poorly supported). + buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) + if output_tensor: + return buf + return buf.cpu().numpy().astype(np.uint16, copy=False) + + # ── NumPy path: single fp32 allocation, ``out=`` for in-place math. ───── + arr = np.asarray(quantized) + if arr.ndim >= 3: + # Drop the single-channel dimension so the math runs on (..., H, W). + arr = np.squeeze(arr, axis=-3) if arr.shape[-3] == 1 else np.squeeze(arr, axis=-1) + + buf = np.empty(arr.shape, dtype=np.float32) + np.multiply(arr, scale, out=buf) + np.add(buf, offset, out=buf) + if use_log: + np.exp(buf, out=buf) + np.subtract(buf, shift_m, out=buf) + np.clip(buf, depth_min_m, depth_max_m, out=buf) + buf = np.expand_dims(buf, axis=-1) if output_channel_last else np.expand_dims(buf, axis=-3) - # Return depth as float32 meters. if output_unit == "m": - return torch.from_numpy(depth_m) if output_tensor else depth_m + return torch.from_numpy(buf) if output_tensor else buf - # Return depth as uint16 millimeters. - mm = np.rint(depth_m * _MM_PER_METRE).clip(0, _UINT16_MAX).astype(np.uint16, copy=False) + np.multiply(buf, _MM_PER_METRE, out=buf) + np.rint(buf, out=buf) + np.clip(buf, 0.0, _UINT16_MAX, out=buf) if output_tensor: - # torch.uint16 support is very limited, we convert to float32 instead. - return torch.from_numpy(mm.astype(np.float32)) - else: - return mm + # torch.uint16 support is very limited; return float32 millimetres. + return torch.from_numpy(buf) + return buf.astype(np.uint16, copy=False)