fix(lossless): tuning depth encoding parameters for lossless depth storage

2026-06-30 22:57:00 +00:00 · 2026-06-24 22:41:00 +02:00
parent 96e04e5d05
commit 24c0bf6199
3 changed files with 15 additions and 13 deletions
@@ -100,14 +100,15 @@ lerobot-record \
    --dataset.depth_encoder.use_log=true
 ```

-| Parameter   | Type    | Default      | Description                                                                                                                       |
-| ----------- | ------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------- |
-| `vcodec`    | `str`   | `"hevc"`     | Defaults to HEVC Main 12 (a 12-bit-capable codec). For a lossless depth stream, keep HEVC and pass `extra_options={"x265-params": "lossless=1"}` (stays MP4-compatible). |
-| `pix_fmt`   | `str`   | `"gray12le"` | Single-channel 12-bit pixel format used to carry the quantized codes.                                                             |
-| `depth_min` | `float` | `0.01`       | Depth in metres mapped to quantum `0`. Values below are clipped on decode.                                                        |
-| `depth_max` | `float` | `10.0`       | Depth in metres mapped to quantum `4095`. Values above are clipped on decode.                                                     |
-| `shift`     | `float` | `3.5`        | Pre-log offset (metres) used in logarithmic quantization for numerical stability near zero. Must satisfy `depth_min + shift > 0`. |
-| `use_log`   | `bool`  | `True`       | If `true`, quantize in log-space (recommended for typical depth sensors). Set to `false` for uniform/linear quantization.         |
+| Parameter       | Type    | Default                         | Description                                                                                                                            |
+| --------------- | ------- | ------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| `vcodec`        | `str`   | `"hevc"`                        | HEVC Main 12 (a 12-bit-capable codec, MP4-compatible).                                                                                 |
+| `extra_options` | `dict`  | `{"x265-params": "lossless=1"}` | **Depth defaults to lossless** (exact round-trip); `crf` is ignored. Pass `extra_options={}` and set `crf` for a smaller lossy stream. |
+| `pix_fmt`       | `str`   | `"gray12le"`                    | Single-channel 12-bit pixel format used to carry the quantized codes.                                                                  |
+| `depth_min`     | `float` | `0.01`                          | Depth in metres mapped to quantum `0`. Values below are clipped on decode.                                                             |
+| `depth_max`     | `float` | `10.0`                          | Depth in metres mapped to quantum `4095`. Values above are clipped on decode.                                                          |
+| `shift`         | `float` | `3.5`                           | Pre-log offset (metres) used in logarithmic quantization for numerical stability near zero. Must satisfy `depth_min + shift > 0`.      |
+| `use_log`       | `bool`  | `True`                          | If `true`, quantize in log-space (recommended for typical depth sensors). Set to `false` for uniform/linear quantization.              |

 > [!TIP]
 > `depth_min`, `depth_max`, and `shift` are always interpreted in **metres**, regardless of the input depth's unit. Inputs are auto-detected: integer arrays (e.g. `uint16` millimetres straight from a RealSense) are treated as millimetres, floating-point arrays as metres.
@@ -36,9 +36,7 @@ HW_VIDEO_CODECS = [
    "h264_vaapi",  # Linux Intel/AMD
    "h264_qsv",  # Intel Quick Sync
 ]
-VALID_VIDEO_CODECS: frozenset[str] = frozenset(
-    {"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}
-)
+VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
 # Aliases for legacy video codec names.
 VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}

@@ -272,8 +270,9 @@ class DepthEncoderConfig(VideoEncoderConfig):
    ``"gray12le"``.
    """

-    vcodec: str = "hevc"
-    pix_fmt: str = "gray12le"
+    vcodec: str = "hevc"  # Video codec name. Defaults to HEVC Main 12 (a 12-bit-capable codec).
+    pix_fmt: str = "gray12le"  # Pixel format. Defaults to 12-bit grayscale.
+    extra_options: dict[str, Any] = field(default_factory=lambda: {"x265-params": "lossless=1"})

    depth_min: float = DEFAULT_DEPTH_MIN  # Minimum depth in meters, mapped to the lowest quantum.
    depth_max: float = DEFAULT_DEPTH_MAX  # Maximum depth in meters, mapped to the highest quantum.
@@ -730,6 +730,7 @@ class TestEncodeDepthVideoFrames:
            pix_fmt="gray12le",
            g=4,
            crf=25,
+            extra_options={},
            depth_min=0.05,
            depth_max=8.0,
            shift=2.5,
@@ -777,6 +778,7 @@ class TestDepthEncoderConfigPersistence:
            pix_fmt="gray12le",
            g=2,
            crf=30,
+            extra_options={},
            depth_min=0.05,
            depth_max=8.0,
            shift=2.5,