Compare commits

...

3 Commits

Author SHA1 Message Date
CarolinePascal 30b79a56f7 docs(depth stats): updating docs 2026-06-30 14:06:41 +02:00
CarolinePascal 036d99bf74 test(depth stats): updating tests 2026-06-30 13:51:07 +02:00
CarolinePascal eba1d1bd0c feat(depth stats): enforcing all depth stats to be in millimeters (default unit) for consistency 2026-06-30 13:50:57 +02:00
6 changed files with 77 additions and 19 deletions
@@ -134,6 +134,9 @@ lerobot-train \
> [!TIP] > [!TIP]
> This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras. > This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras.
> [!IMPORTANT]
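> Depth statistics in `meta/stats.json` are always stored in **millimetres**, regardless of the raw frame dtype: floating-point depth frames are interpreted as metres and scaled by 1000, while integer depth frames are interpreted as millimetres and left unchanged.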
--- ---
## Persistence in dataset metadata ## Persistence in dataset metadata
+11 -5
View File
@@ -22,6 +22,7 @@ import numpy as np
from lerobot.processor import RelativeActionsProcessorStep from lerobot.processor import RelativeActionsProcessorStep
from lerobot.utils.constants import ACTION, OBS_STATE from lerobot.utils.constants import ACTION, OBS_STATE
from .depth_utils import MM_PER_METRE
from .io_utils import load_image_as_numpy from .io_utils import load_image_as_numpy
DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99] DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99]
@@ -508,8 +509,8 @@ def compute_episode_stats(
Note: Note:
For 'image'/'video' features, stats are computed per channel and kept with a For 'image'/'video' features, stats are computed per channel and kept with a
leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip 255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) are
this rescaling and remain in their stored units. instead canonicalized to millimetres regardless of the raw frame unit.
""" """
if quantile_list is None: if quantile_list is None:
quantile_list = DEFAULT_QUANTILES quantile_list = DEFAULT_QUANTILES
@@ -533,9 +534,14 @@ def compute_episode_stats(
) )
if features[key]["dtype"] in ["image", "video"]: if features[key]["dtype"] in ["image", "video"]:
normalization_factor = ( if (features[key].get("info") or {}).get("is_depth_map", False):
255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0 # Depth stats are canonically stored in millimetres; metre (float) depth is
) # scaled up, integer (millimetre) depth is left as-is.
normalization_factor = (
1.0 / MM_PER_METRE if np.issubdtype(ep_ft_array.dtype, np.floating) else 1.0
)
else:
normalization_factor = 255.0
ep_stats[key] = { ep_stats[key] = {
k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0) k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0)
for k, v in ep_stats[key].items() for k, v in ep_stats[key].items()
+6 -6
View File
@@ -39,7 +39,7 @@ from lerobot.configs.video import (
from .image_writer import squeeze_single_channel from .image_writer import squeeze_single_channel
from .pyav_utils import write_u16_plane from .pyav_utils import write_u16_plane
_MM_PER_METRE = 1000.0 MM_PER_METRE = 1000.0
_UINT16_MAX = 65535 _UINT16_MAX = 65535
@@ -126,12 +126,12 @@ def quantize_depth(
# Convert depth_min, depth_max, and shift to the resolved input unit. # Convert depth_min, depth_max, and shift to the resolved input unit.
depth_min_u = ( depth_min_u = (
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE) np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
) )
depth_max_u = ( depth_max_u = (
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE) np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
) )
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE) shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
# Normalization and quantization is performed in the resolved input unit. # Normalization and quantization is performed in the resolved input unit.
if use_log: if use_log:
@@ -236,7 +236,7 @@ def dequantize_depth(
# mm path: round + clamp in float32, skipping the uint16 round-trip # mm path: round + clamp in float32, skipping the uint16 round-trip
# when returning a tensor (torch.uint16 is poorly supported). # when returning a tensor (torch.uint16 is poorly supported).
buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX) buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
if output_tensor: if output_tensor:
return buf return buf
return buf.cpu().numpy().astype(np.uint16, copy=False) return buf.cpu().numpy().astype(np.uint16, copy=False)
@@ -259,7 +259,7 @@ def dequantize_depth(
if output_unit == DEPTH_METER_UNIT: if output_unit == DEPTH_METER_UNIT:
return torch.from_numpy(buf) if output_tensor else buf return torch.from_numpy(buf) if output_tensor else buf
np.multiply(buf, _MM_PER_METRE, out=buf) np.multiply(buf, MM_PER_METRE, out=buf)
np.rint(buf, out=buf) np.rint(buf, out=buf)
np.clip(buf, 0.0, _UINT16_MAX, out=buf) np.clip(buf, 0.0, _UINT16_MAX, out=buf)
if output_tensor: if output_tensor:
+4 -1
View File
@@ -47,7 +47,7 @@ from lerobot.configs import (
) )
from lerobot.utils.import_utils import get_safe_default_video_backend from lerobot.utils.import_utils import get_safe_default_video_backend
from .depth_utils import quantize_depth from .depth_utils import MM_PER_METRE, quantize_depth
from .pyav_utils import get_pix_fmt_channels from .pyav_utils import get_pix_fmt_channels
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -848,6 +848,9 @@ class _CameraEncoderThread(threading.Thread):
# Reshape CHW to (H*W, C) for per-channel stats # Reshape CHW to (H*W, C) for per-channel stats
channels = img_downsampled.shape[0] channels = img_downsampled.shape[0]
img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels) img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
# Depth stats are canonically stored in millimetres; metre (float) depth is scaled up.
if self.is_depth and np.issubdtype(frame_data.dtype, np.floating):
img_for_stats = img_for_stats * MM_PER_METRE
stats_tracker.update(img_for_stats) stats_tracker.update(img_for_stats)
frame_count += 1 frame_count += 1
+41
View File
@@ -245,3 +245,44 @@ class TestFeatureFileRouting:
dataset.save_episode() dataset.save_episode()
dataset.finalize() dataset.finalize()
# ── 5. Depth stats unit canonicalization (millimetres) ────────────────
class TestDepthStatsUnit:
    """Check that recorded depth stats come out in millimetres for every raw frame dtype."""

    NUM_FRAMES = 4

    @pytest.mark.parametrize("use_videos", [False, True])
    def test_stats_canonicalized_to_mm(self, tmp_path, features_factory, use_videos):
        """Record one physical depth range as float metres and as integer millimetres,
        then check that both datasets report matching millimetre-scale means."""
        from lerobot.datasets.lerobot_dataset import LeRobotDataset

        def _recorded_depth_mean(depth_dtype, root):
            # Record a single episode whose depth frames use ``depth_dtype`` and
            # return the flattened mean of the resulting depth stats.
            ds = LeRobotDataset.create(
                repo_id=DUMMY_REPO_ID,
                fps=DEFAULT_FPS,
                features=features_factory(
                    camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos
                ),
                root=root,
                use_videos=use_videos,
                streaming_encoding=use_videos,
            )
            add_frames(ds, num_frames=self.NUM_FRAMES, depth_dtype=depth_dtype)
            ds.save_episode()
            ds.finalize()
            depth_mean = ds.meta.stats[DEPTH_KEY]["mean"]
            return np.asarray(depth_mean).reshape(-1)

        # add_frames ramps float depth over 0.1 to 10 m and integer depth over
        # 100 to 10000 mm (the same physical range), so canonicalized stats must match.
        metre_mean = _recorded_depth_mean(np.float32, tmp_path / "ds_m")
        millimetre_mean = _recorded_depth_mean(np.uint16, tmp_path / "ds_mm")

        # A mean above 50 shows the float (metre) input was scaled to millimetres
        # rather than left in the single-digit metre range.
        assert metre_mean.item() > 50.0
        np.testing.assert_allclose(metre_mean, millimetre_mean, rtol=0.05)
+12 -7
View File
@@ -49,16 +49,18 @@ from tests.fixtures.constants import (
) )
def add_frames(dataset: LeRobotDataset, num_frames: int) -> None: def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype = np.uint16) -> None:
"""Append ``num_frames`` synthetic frames to ``dataset``. """Append ``num_frames`` synthetic frames to ``dataset``.
Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for Generates per-feature payloads from ``dataset.meta``: depth ramps (``depth_dtype``,
keys in ``dataset.meta.depth_keys``, uint8 random noise for video/image keys, default ``uint16`` millimetres; pass ``np.float32`` for metres) for keys in
and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, ``dataset.meta.depth_keys``, uint8 random noise for video/image keys, and float32
frame_index, ...) are auto-populated by ``add_frame`` and skipped here. zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, frame_index, ...) are
auto-populated by ``add_frame`` and skipped here.
""" """
video_keys = dataset.meta.video_keys video_keys = dataset.meta.video_keys
depth_keys = dataset.meta.depth_keys depth_keys = dataset.meta.depth_keys
depth_is_float = np.issubdtype(depth_dtype, np.floating)
# Smooth gradient base reused per (H, W) to keep depth frames cheap to # Smooth gradient base reused per (H, W) to keep depth frames cheap to
# encode (HEVC Main 12 hates white noise). # encode (HEVC Main 12 hates white noise).
_depth_base_cache: dict[tuple[int, int], np.ndarray] = {} _depth_base_cache: dict[tuple[int, int], np.ndarray] = {}
@@ -70,11 +72,14 @@ def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
shape = ft["shape"] shape = ft["shape"]
if key in depth_keys: if key in depth_keys:
h, w, _ = shape h, w, _ = shape
# Float depth is expressed in metres, integer depth in millimetres.
lo, hi = (0.1, 10.0) if depth_is_float else (100.0, 10_000.0)
base = _depth_base_cache.setdefault( base = _depth_base_cache.setdefault(
(h, w), (h, w),
np.linspace(100.0, 10_000.0, h * w, dtype=np.float32).reshape(h, w, 1), np.linspace(lo, hi, h * w, dtype=np.float32).reshape(h, w, 1),
) )
frame[key] = (base + 50.0 * i).clip(0, 65535).astype(np.uint16) step = (0.05 if depth_is_float else 50.0) * i
frame[key] = (base + step).clip(0, 65535).astype(depth_dtype)
elif key in video_keys: elif key in video_keys:
frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8) frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8)
else: else: