Compare commits

...

6 Commits

10 changed files with 172 additions and 15 deletions
+4
View File
@@ -34,6 +34,8 @@ from .types import (
)
from .video import (
DEFAULT_DEPTH_UNIT,
DEPTH_METER_UNIT,
DEPTH_MILLIMETER_UNIT,
VALID_VIDEO_CODECS,
VIDEO_ENCODER_INFO_KEYS,
DepthEncoderConfig,
@@ -72,6 +74,8 @@ __all__ = [
"encoder_config_from_video_info",
# Constants
"DEFAULT_DEPTH_UNIT",
"DEPTH_METER_UNIT",
"DEPTH_MILLIMETER_UNIT",
"VALID_VIDEO_CODECS",
"VIDEO_ENCODER_INFO_KEYS",
]
+1 -1
View File
@@ -509,7 +509,7 @@ def compute_episode_stats(
For 'image'/'video' features, stats are computed per channel and kept with a
leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
this rescaling and remain in their stored units.
this rescaling and remain in their stored units (stored in ``depth_unit``).
"""
if quantile_list is None:
quantile_list = DEFAULT_QUANTILES
+31 -1
View File
@@ -26,12 +26,13 @@ import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import snapshot_download
from lerobot.configs import VideoEncoderConfig
from lerobot.configs import DEPTH_METER_UNIT, VideoEncoderConfig
from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE
from lerobot.utils.feature_utils import _validate_feature_names
from lerobot.utils.utils import flatten_dict
from .compute_stats import aggregate_stats
from .depth_utils import MM_PER_METRE
from .feature_utils import create_empty_dataset_info
from .io_utils import (
get_file_size_in_mb,
@@ -358,6 +359,35 @@ class LeRobotDatasetMetadata:
return [key for key, ft in self.features.items() if _is_depth(ft)]
def rescale_depth_stats(self, output_unit: str) -> None:
"""Rescale depth feature stats in place from their recorded unit to ``output_unit``.
Depth stats are stored in the unit the frames were recorded in
(``features[key]["info"]["depth_unit"]``), while frames are returned in
``output_unit`` on read. This converts the unit-bearing stat entries so
stats match the frames consumers see.
"""
missing_unit_keys = [
key for key in self.depth_keys if (self.features[key].get("info") or {}).get("depth_unit") is None
]
if missing_unit_keys:
logging.warning(
f"Depth feature(s) {missing_unit_keys} have no recorded 'depth_unit' in their info. "
f"Depth maps and stats for these keys will be returned AS IS, with no unit conversion "
f"to the requested output unit {output_unit!r}. Re-record the dataset or set 'depth_unit' "
f"in the feature info (meta/info.json) to enable conversion."
)
if self.stats is None:
return
for key in self.depth_keys:
stored_unit = (self.features[key].get("info") or {}).get("depth_unit")
if stored_unit is None or stored_unit == output_unit or key not in self.stats:
continue
factor = MM_PER_METRE if stored_unit == DEPTH_METER_UNIT else 1.0 / MM_PER_METRE
self.stats[key] = {
stat: value if stat == "count" else value * factor for stat, value in self.stats[key].items()
}
@property
def camera_keys(self) -> list[str]:
"""Keys to access visual modalities (regardless of their storage method)."""
+20 -2
View File
@@ -22,10 +22,14 @@ from pathlib import Path
import datasets
import torch
from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig
from lerobot.configs import (
DEFAULT_DEPTH_UNIT,
DEPTH_METER_UNIT,
DepthEncoderConfig,
)
from .dataset_metadata import LeRobotDatasetMetadata
from .depth_utils import dequantize_depth
from .depth_utils import MM_PER_METRE, dequantize_depth
from .feature_utils import (
check_delta_timestamps,
get_delta_indices,
@@ -102,6 +106,13 @@ class DatasetReader:
for vid_key in self._meta.depth_keys
}
# Get the input unit of each depth feature stored as raw images.
self._image_depth_units: dict[str, str | None] = {
key: (self._meta.features[key].get("info") or {}).get("depth_unit")
for key in self._meta.depth_keys
if key in self._meta.image_keys
}
def set_image_transforms(self, image_transforms: Callable | None) -> None:
"""Replace the transform applied to visual observations."""
if image_transforms is not None and not callable(image_transforms):
@@ -329,6 +340,13 @@ class DatasetReader:
continue
item[cam] = self._image_transforms(item[cam])
# Convert depth features to the output unit.
for key, stored_unit in self._image_depth_units.items():
if key in item and stored_unit is not None and stored_unit != self._depth_output_unit:
item[key] = (
item[key] * MM_PER_METRE if stored_unit == DEPTH_METER_UNIT else item[key] / MM_PER_METRE
)
# Add task as a string
task_idx = item["task_index"].item()
item["task"] = self._meta.tasks.iloc[task_idx].name
+10
View File
@@ -41,6 +41,7 @@ from lerobot.configs import (
from .compute_stats import compute_episode_stats
from .dataset_metadata import LeRobotDatasetMetadata
from .depth_utils import infer_depth_unit
from .feature_utils import (
get_hf_features_from_features,
validate_episode_buffer,
@@ -209,6 +210,15 @@ class DatasetWriter:
self.episode_buffer["timestamp"].append(timestamp)
self.episode_buffer["task"].append(frame.pop("task"))
# Record each depth feature's input unit once, inferred from the first frame's dtype.
if frame_index == 0:
for depth_key in self._meta.depth_keys:
if depth_key not in frame:
continue
info = self._meta.features[depth_key].setdefault("info", {})
if info.get("depth_unit") is None:
info["depth_unit"] = infer_depth_unit(np.asarray(frame[depth_key]).dtype)
# Start streaming encoder on first frame of episode
if frame_index == 0 and self._streaming_encoder is not None:
self._streaming_encoder.start_episode(
+15 -11
View File
@@ -39,10 +39,18 @@ from lerobot.configs.video import (
from .image_writer import squeeze_single_channel
from .pyav_utils import write_u16_plane
_MM_PER_METRE = 1000.0
MM_PER_METRE = 1000.0
_UINT16_MAX = 65535
def infer_depth_unit(dtype: np.dtype | type) -> str:
"""Infer the physical unit of raw depth frames from their dtype.
Floating-point frames are assumed to be in metres, integer frames in millimetres.
"""
return DEPTH_METER_UNIT if np.issubdtype(np.dtype(dtype), np.floating) else DEPTH_MILLIMETER_UNIT
def _validate_log_quant_params(depth_min: float, shift: float) -> None:
"""Ensure ``log(depth_min + shift)`` is finite."""
if depth_min + shift <= 0:
@@ -57,11 +65,7 @@ def _depth_input_to_float32_and_unit(
input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT],
) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]:
"""Convert depth to float32 in the chosen unit, and return the resolved unit."""
resolved_unit = (
(DEPTH_METER_UNIT if np.issubdtype(depth.dtype, np.floating) else DEPTH_MILLIMETER_UNIT)
if input_unit == "auto"
else input_unit
)
resolved_unit = infer_depth_unit(depth.dtype) if input_unit == "auto" else input_unit
return depth.astype(np.float32, order="K"), resolved_unit
@@ -126,12 +130,12 @@ def quantize_depth(
# Convert depth_min, depth_max, and shift to the resolved input unit.
depth_min_u = (
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
)
depth_max_u = (
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
)
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
# Normalization and quantization is performed in the resolved input unit.
if use_log:
@@ -236,7 +240,7 @@ def dequantize_depth(
# mm path: round + clamp in float32, skipping the uint16 round-trip
# when returning a tensor (torch.uint16 is poorly supported).
buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
if output_tensor:
return buf
return buf.cpu().numpy().astype(np.uint16, copy=False)
@@ -259,7 +263,7 @@ def dequantize_depth(
if output_unit == DEPTH_METER_UNIT:
return torch.from_numpy(buf) if output_tensor else buf
np.multiply(buf, _MM_PER_METRE, out=buf)
np.multiply(buf, MM_PER_METRE, out=buf)
np.rint(buf, out=buf)
np.clip(buf, 0.0, _UINT16_MAX, out=buf)
if output_tensor:
+1
View File
@@ -224,6 +224,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
)
self.root = self.meta.root
self.revision = self.meta.revision
self.meta.rescale_depth_stats(self._depth_output_unit)
if episodes is not None and any(
episode >= self.meta.total_episodes or episode < 0 for episode in episodes
@@ -310,6 +310,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
)
self.root = self.meta.root
self.revision = self.meta.revision
self.meta.rescale_depth_stats(self._depth_output_unit)
# Check version
check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)
+81
View File
@@ -32,6 +32,7 @@ from lerobot.configs.video import (
)
from lerobot.datasets.depth_utils import dequantize_depth, quantize_depth
from lerobot.datasets.image_writer import image_array_to_pil_image, write_image
from lerobot.utils.constants import DEFAULT_FEATURES
from tests.fixtures.constants import (
DEFAULT_FPS,
DUMMY_CAMERA_FEATURES,
@@ -245,3 +246,83 @@ class TestFeatureFileRouting:
dataset.save_episode()
dataset.finalize()
class TestDepthUnitMetadata:
"""The depth unit is inferred once from dtype, stored in ``info``, and drives stats + reads."""
NUM_FRAMES = 4
def _record(self, root, features_factory, depth_dtype, value, use_videos):
from lerobot.datasets.lerobot_dataset import LeRobotDataset
features = features_factory(camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos)
dataset = LeRobotDataset.create(
repo_id=DUMMY_REPO_ID,
fps=DEFAULT_FPS,
features=features,
root=root,
use_videos=use_videos,
streaming_encoding=use_videos,
)
for _ in range(self.NUM_FRAMES):
frame: dict = {"task": "test"}
for key, ft in dataset.meta.features.items():
if key in DEFAULT_FEATURES:
continue
if key in dataset.meta.depth_keys:
frame[key] = np.full(ft["shape"], value, dtype=depth_dtype)
elif key in dataset.meta.camera_keys:
frame[key] = np.random.randint(0, 256, ft["shape"], dtype=np.uint8)
else:
frame[key] = np.zeros(ft["shape"], dtype=np.float32)
dataset.add_frame(frame)
return dataset
@pytest.mark.parametrize("use_videos", [False, True])
@pytest.mark.parametrize(
("depth_dtype", "value", "expected_unit"),
[(np.float32, 2.0, DEPTH_METER_UNIT), (np.uint16, 2000, DEPTH_MILLIMETER_UNIT)],
)
def test_recorded_unit_inferred_persisted_and_kept_in_stats(
self, tmp_path, features_factory, use_videos, depth_dtype, value, expected_unit
):
"""Unit is inferred from the first frame's dtype, drives stats (raw, never canonicalized), and survives a reload."""
from lerobot.datasets.lerobot_dataset import LeRobotDataset
dataset = self._record(tmp_path / "ds", features_factory, depth_dtype, value, use_videos)
assert dataset.meta.features[DEPTH_KEY]["info"]["depth_unit"] == expected_unit
dataset.save_episode()
mean = float(np.asarray(dataset.meta.stats[DEPTH_KEY]["mean"]).reshape(-1)[0])
np.testing.assert_allclose(mean, value, rtol=0.05)
dataset.finalize()
reloaded = LeRobotDataset(repo_id=DUMMY_REPO_ID, root=tmp_path / "ds")
assert reloaded.meta.features[DEPTH_KEY]["info"]["depth_unit"] == expected_unit
@pytest.mark.parametrize("use_videos", [False, True])
@pytest.mark.parametrize(
("output_unit", "expected"),
[(DEPTH_MILLIMETER_UNIT, 2000.0), (DEPTH_METER_UNIT, 2.0)],
)
def test_read_honors_output_unit_for_frames_and_stats(
self, tmp_path, features_factory, use_videos, output_unit, expected
):
"""Reloading with a ``depth_output_unit`` converts metre frames (image mode) and rescales stats while preserving count."""
from lerobot.datasets.lerobot_dataset import LeRobotDataset
dataset = self._record(tmp_path / "ds", features_factory, np.float32, 2.0, use_videos=use_videos)
dataset.save_episode()
count = float(np.asarray(dataset.meta.stats[DEPTH_KEY]["count"]).reshape(-1)[0])
dataset.finalize()
read_dataset = LeRobotDataset(
repo_id=DUMMY_REPO_ID, root=tmp_path / "ds", depth_output_unit=output_unit
)
stats = read_dataset.meta.stats[DEPTH_KEY]
np.testing.assert_allclose(float(np.asarray(stats["mean"]).reshape(-1)[0]), expected, rtol=0.05)
np.testing.assert_allclose(float(np.asarray(stats["count"]).reshape(-1)[0]), count)
if not use_videos:
depth = read_dataset[0][DEPTH_KEY]
assert torch.allclose(depth, torch.full_like(depth, expected))
+8
View File
@@ -27,6 +27,7 @@ import torch
from datasets import Dataset
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
from lerobot.datasets.depth_utils import infer_depth_unit
from lerobot.datasets.feature_utils import get_hf_features_from_features
from lerobot.datasets.io_utils import flatten_dict, hf_transform_to_torch
from lerobot.datasets.lerobot_dataset import LeRobotDataset
@@ -535,6 +536,13 @@ def lerobot_dataset_factory(
chunks_size=chunks_size,
**info_kwargs,
)
# This synthetic path skips add_frame, so record the depth unit the writer would
# have stored (dummy depth is uint16) to keep ``depth_unit`` present in info.json.
# Reassign a fresh info dict to avoid mutating the shared feature constants.
for ft in info.features.values():
ft_info = ft.get("info")
if ft_info is not None and ft_info.get("is_depth_map") and "depth_unit" not in ft_info:
ft["info"] = {**ft_info, "depth_unit": infer_depth_unit(np.uint16)}
if stats is None:
stats = stats_factory(features=info.features)
if tasks is None: