mirror of
https://github.com/huggingface/lerobot.git
synced 2026-07-01 15:17:05 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d7be868dfa |
@@ -134,9 +134,6 @@ lerobot-train \
|
||||
> [!TIP]
|
||||
> This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Depth statistics in `meta/stats.json` are always computed in **millimetres**, regardless of the raw frame dtype.
|
||||
|
||||
---
|
||||
|
||||
## Persistence in dataset metadata
|
||||
|
||||
@@ -36,7 +36,9 @@ HW_VIDEO_CODECS = [
|
||||
"h264_vaapi", # Linux Intel/AMD
|
||||
"h264_qsv", # Intel Quick Sync
|
||||
]
|
||||
VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
|
||||
VALID_VIDEO_CODECS: frozenset[str] = frozenset(
|
||||
{"h264", "hevc", "libsvtav1", "libaom-av1", "auto", *HW_VIDEO_CODECS}
|
||||
)
|
||||
# Aliases for legacy video codec names.
|
||||
VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}
|
||||
|
||||
@@ -220,6 +222,12 @@ class VideoEncoderConfig:
|
||||
if self.fast_decode:
|
||||
opts["tune"] = "fastdecode"
|
||||
set_if("threads", encoder_threads)
|
||||
elif self.vcodec == "libaom-av1":
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
if encoder_threads is not None:
|
||||
opts["threads"] = encoder_threads
|
||||
opts["row-mt"] = 1
|
||||
elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
|
||||
if self.crf is not None:
|
||||
opts["q:v"] = max(1, min(100, 100 - self.crf * 2))
|
||||
|
||||
@@ -22,7 +22,6 @@ import numpy as np
|
||||
from lerobot.processor import RelativeActionsProcessorStep
|
||||
from lerobot.utils.constants import ACTION, OBS_STATE
|
||||
|
||||
from .depth_utils import MM_PER_METRE
|
||||
from .io_utils import load_image_as_numpy
|
||||
|
||||
DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99]
|
||||
@@ -509,8 +508,8 @@ def compute_episode_stats(
|
||||
Note:
|
||||
For 'image'/'video' features, stats are computed per channel and kept with a
|
||||
leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
|
||||
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) are
|
||||
instead canonicalized to millimetres regardless of the raw frame unit.
|
||||
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
|
||||
this rescaling and remain in their stored units.
|
||||
"""
|
||||
if quantile_list is None:
|
||||
quantile_list = DEFAULT_QUANTILES
|
||||
@@ -534,14 +533,9 @@ def compute_episode_stats(
|
||||
)
|
||||
|
||||
if features[key]["dtype"] in ["image", "video"]:
|
||||
if (features[key].get("info") or {}).get("is_depth_map", False):
|
||||
# Depth stats are canonically stored in millimetres; metre (float) depth is
|
||||
# scaled up, integer (millimetre) depth is left as-is.
|
||||
normalization_factor = (
|
||||
1.0 / MM_PER_METRE if np.issubdtype(ep_ft_array.dtype, np.floating) else 1.0
|
||||
)
|
||||
else:
|
||||
normalization_factor = 255.0
|
||||
normalization_factor = (
|
||||
255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0
|
||||
)
|
||||
ep_stats[key] = {
|
||||
k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0)
|
||||
for k, v in ep_stats[key].items()
|
||||
|
||||
@@ -39,7 +39,7 @@ from lerobot.configs.video import (
|
||||
from .image_writer import squeeze_single_channel
|
||||
from .pyav_utils import write_u16_plane
|
||||
|
||||
MM_PER_METRE = 1000.0
|
||||
_MM_PER_METRE = 1000.0
|
||||
_UINT16_MAX = 65535
|
||||
|
||||
|
||||
@@ -126,12 +126,12 @@ def quantize_depth(
|
||||
|
||||
# Convert depth_min, depth_max, and shift to the resolved input unit.
|
||||
depth_min_u = (
|
||||
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
|
||||
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
|
||||
)
|
||||
depth_max_u = (
|
||||
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
|
||||
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
|
||||
)
|
||||
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
|
||||
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
|
||||
|
||||
# Normalization and quantization is performed in the resolved input unit.
|
||||
if use_log:
|
||||
@@ -236,7 +236,7 @@ def dequantize_depth(
|
||||
|
||||
# mm path: round + clamp in float32, skipping the uint16 round-trip
|
||||
# when returning a tensor (torch.uint16 is poorly supported).
|
||||
buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
|
||||
buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
|
||||
if output_tensor:
|
||||
return buf
|
||||
return buf.cpu().numpy().astype(np.uint16, copy=False)
|
||||
@@ -259,7 +259,7 @@ def dequantize_depth(
|
||||
if output_unit == DEPTH_METER_UNIT:
|
||||
return torch.from_numpy(buf) if output_tensor else buf
|
||||
|
||||
np.multiply(buf, MM_PER_METRE, out=buf)
|
||||
np.multiply(buf, _MM_PER_METRE, out=buf)
|
||||
np.rint(buf, out=buf)
|
||||
np.clip(buf, 0.0, _UINT16_MAX, out=buf)
|
||||
if output_tensor:
|
||||
|
||||
@@ -47,7 +47,7 @@ from lerobot.configs import (
|
||||
)
|
||||
from lerobot.utils.import_utils import get_safe_default_video_backend
|
||||
|
||||
from .depth_utils import MM_PER_METRE, quantize_depth
|
||||
from .depth_utils import quantize_depth
|
||||
from .pyav_utils import get_pix_fmt_channels
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -848,9 +848,6 @@ class _CameraEncoderThread(threading.Thread):
|
||||
# Reshape CHW to (H*W, C) for per-channel stats
|
||||
channels = img_downsampled.shape[0]
|
||||
img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
|
||||
# Depth stats are canonically stored in millimetres; metre (float) depth is scaled up.
|
||||
if self.is_depth and np.issubdtype(frame_data.dtype, np.floating):
|
||||
img_for_stats = img_for_stats * MM_PER_METRE
|
||||
stats_tracker.update(img_for_stats)
|
||||
|
||||
frame_count += 1
|
||||
|
||||
@@ -1531,6 +1531,7 @@ def test_valid_video_codecs_constant():
|
||||
assert "h264" in VALID_VIDEO_CODECS
|
||||
assert "hevc" in VALID_VIDEO_CODECS
|
||||
assert "libsvtav1" in VALID_VIDEO_CODECS
|
||||
assert "libaom-av1" in VALID_VIDEO_CODECS
|
||||
assert "auto" in VALID_VIDEO_CODECS
|
||||
assert "h264_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "h264_nvenc" in VALID_VIDEO_CODECS
|
||||
@@ -1538,7 +1539,7 @@ def test_valid_video_codecs_constant():
|
||||
assert "h264_qsv" in VALID_VIDEO_CODECS
|
||||
assert "hevc_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "hevc_nvenc" in VALID_VIDEO_CODECS
|
||||
assert len(VALID_VIDEO_CODECS) == 10
|
||||
assert len(VALID_VIDEO_CODECS) == 11
|
||||
|
||||
|
||||
def test_delta_timestamps_with_episodes_filter(tmp_path, empty_lerobot_dataset_factory):
|
||||
|
||||
@@ -245,44 +245,3 @@ class TestFeatureFileRouting:
|
||||
|
||||
dataset.save_episode()
|
||||
dataset.finalize()
|
||||
|
||||
|
||||
# ── 5. Depth stats unit canonicalization (millimetres) ────────────────
|
||||
|
||||
|
||||
class TestDepthStatsUnit:
|
||||
"""Depth stats are always stored in millimetres, regardless of raw frame dtype."""
|
||||
|
||||
NUM_FRAMES = 4
|
||||
|
||||
@pytest.mark.parametrize("use_videos", [False, True])
|
||||
def test_stats_canonicalized_to_mm(self, tmp_path, features_factory, use_videos):
|
||||
"""Float (metre) and integer (millimetre) depth over the same physical range
|
||||
yield identical millimetre-scale stats."""
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
def _record(depth_dtype, root):
|
||||
features = features_factory(
|
||||
camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos
|
||||
)
|
||||
dataset = LeRobotDataset.create(
|
||||
repo_id=DUMMY_REPO_ID,
|
||||
fps=DEFAULT_FPS,
|
||||
features=features,
|
||||
root=root,
|
||||
use_videos=use_videos,
|
||||
streaming_encoding=use_videos,
|
||||
)
|
||||
add_frames(dataset, num_frames=self.NUM_FRAMES, depth_dtype=depth_dtype)
|
||||
dataset.save_episode()
|
||||
dataset.finalize()
|
||||
return np.asarray(dataset.meta.stats[DEPTH_KEY]["mean"]).reshape(-1)
|
||||
|
||||
# add_frames ramps float depth over 0.1–10 m and integer depth over 100–10000 mm
|
||||
# (the same physical range), so canonicalized stats must match.
|
||||
mean_m = _record(np.float32, tmp_path / "ds_m")
|
||||
mean_mm = _record(np.uint16, tmp_path / "ds_mm")
|
||||
|
||||
# Float (metre) input is scaled to millimetres, not left in the single-digit metre range.
|
||||
assert mean_m.item() > 50.0
|
||||
np.testing.assert_allclose(mean_m, mean_mm, rtol=0.05)
|
||||
|
||||
Vendored
+7
-12
@@ -49,18 +49,16 @@ from tests.fixtures.constants import (
|
||||
)
|
||||
|
||||
|
||||
def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype = np.uint16) -> None:
|
||||
def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
|
||||
"""Append ``num_frames`` synthetic frames to ``dataset``.
|
||||
|
||||
Generates per-feature payloads from ``dataset.meta``: depth ramps (``depth_dtype``,
|
||||
default ``uint16`` millimetres; pass ``np.float32`` for metres) for keys in
|
||||
``dataset.meta.depth_keys``, uint8 random noise for video/image keys, and float32
|
||||
zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, frame_index, ...) are
|
||||
auto-populated by ``add_frame`` and skipped here.
|
||||
Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for
|
||||
keys in ``dataset.meta.depth_keys``, uint8 random noise for video/image keys,
|
||||
and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp,
|
||||
frame_index, ...) are auto-populated by ``add_frame`` and skipped here.
|
||||
"""
|
||||
video_keys = dataset.meta.video_keys
|
||||
depth_keys = dataset.meta.depth_keys
|
||||
depth_is_float = np.issubdtype(depth_dtype, np.floating)
|
||||
# Smooth gradient base reused per (H, W) to keep depth frames cheap to
|
||||
# encode (HEVC Main 12 hates white noise).
|
||||
_depth_base_cache: dict[tuple[int, int], np.ndarray] = {}
|
||||
@@ -72,14 +70,11 @@ def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype =
|
||||
shape = ft["shape"]
|
||||
if key in depth_keys:
|
||||
h, w, _ = shape
|
||||
# Float depth is expressed in metres, integer depth in millimetres.
|
||||
lo, hi = (0.1, 10.0) if depth_is_float else (100.0, 10_000.0)
|
||||
base = _depth_base_cache.setdefault(
|
||||
(h, w),
|
||||
np.linspace(lo, hi, h * w, dtype=np.float32).reshape(h, w, 1),
|
||||
np.linspace(100.0, 10_000.0, h * w, dtype=np.float32).reshape(h, w, 1),
|
||||
)
|
||||
step = (0.05 if depth_is_float else 50.0) * i
|
||||
frame[key] = (base + step).clip(0, 65535).astype(depth_dtype)
|
||||
frame[key] = (base + 50.0 * i).clip(0, 65535).astype(np.uint16)
|
||||
elif key in video_keys:
|
||||
frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user