From 3dd19d043e2f3fe5673b13ea0ebe4f31884c0797 Mon Sep 17 00:00:00 2001 From: Caroline Pascal Date: Sat, 27 Jun 2026 14:21:21 +0200 Subject: [PATCH] feat(depth maps): adding support for depth in LeRobot (#3644) * feat(depth): add depth quantization helpers and tests * feat(video): add ffv1 to supported codecs * feat(depth): persist depth metadata * feat(depth): extend quantization tools to better fit the encoding/decoding pipeline * feat(depth): plumb DepthEncoderConfig through LeRobotDataset and DatasetWriter * feat(depth): wire StreamingVideoEncoder + writer to depth encoder * feat(depth): wire DatasetReader to decode_depth_frames * feat(cameras/realsense): expose async depth in metric meters * feat(features): route 2D camera shapes to observation.depth. * feat(robots/so_follower): emit + populate depth keys when use_depth * feat(record): plumb DepthEncoderConfig through lerobot-record * feat(viz): render depth observations as rr.DepthImage in Viridis * feat(depth maps writer): adding support for raw depth maps recording with image writer * chore(format): format code * feat(depth shape): ensuring depth maps shape is always including the channel * feat(is_depth): simplifying is_depth nested name + legacy support * fix(stop_event): fixing stop_event race condition in camera classes * fix(plumbing): fixing missing parts in the depth maps pipeline * chore(typos): fixing typos * test(fix): fixing exisiting tests to still work with latest features * tests(depth): adding new tests for depth integration validation * feat(pix_fmt channels): use PyAv to check get pixel formats number of channels * feat(refactor): refactor DepthEncoderConfig quantization pipeline, so that the methods do not live in the config class. Add pixel format - channels validation.Move the default pixel format for depth in the config file. 
* fix(pre-commit): fixing mutable defautl value * fix(info): fixing info metadata update when is_depth_map was set * tests(typos): fixing typos in tests * fix(realsense): fixing typo in realsense serial number * fix(normalization): restricting 255 normalization to non depth/uint8 images only * fix(typo): fixing typo * fix(TIFF): add missing quantization and cleanup for TIFF files * feat(batched dequantization): optimizing dequantize_depth for torch based batched dequantization * feat(tools): adding depth support in LeRobotDataset edition tools * test(aggregate): extending aggregation tests to depth frames * test(cleaning): cleaning up tests * fix(from_video_info): fixing early validation issue in from_video_info * fix(typo): fixing typo * fix(is_depth): adding missing doctrings and is_depth arguments in video decoding functions Co-authored-by: Wensi (Vince) Ai <59036629+wensi-ai@users.noreply.github.com> * fix(depth units): fixing depth units output for the realsense cameras * feat(output unit): adding support for output unit specification at dataset reading/training time Co-authored-by: Wensi (Vince) Ai <59036629+wensi-ai@users.noreply.github.com> * test(depth): cleaning up depth tests * test(depth encoding): updating and cleaning video/depth encoding tests * chore(format): formatting code * docs(depth): improving depth maps docs * test(fix): fixing depth tests * test(dataset tools): adding missing tests for new dataset edition tools features * chore(format): formatting code * fix(pyav check): fixing PyAV option validation for integer codec options by normalizing numeric values before calling `is_integer()` Co-authored-by: Wensi (Vince) Ai <59036629+wensi-ai@users.noreply.github.com> * docs(mermaid): fixing mermaid diagram * fix(rebase): rebase follow up corrections * feat(dataset tools): adding missing docstrings and features for depth fill support in dataset edition tools * docs(docstring): updating docstrings * docs(dataset tools): updating docs * fix(save 
images): fixing image saving in dataset tools * fix(update video info): fixing update video info logic to match the recording and editing use cases * test(reencode): fixing reencoding monkeypatch * fix(review): add Claude review * chore(format): format code * fix(update video info): ditching the differentiated approahces for video info update - video info are always updated unless for preserved keys. * chore(rebase): fixing rebase merge conflicts * test(visualization): fixing visualization tests * feat(docstrings): adding explicit docstring for encoding parameters. Docstrigns will now show up as description in the CLI --help. * feat(mm as default): adding a global DEFAULT_DEPTH_UNIT variable setting mm as default depth unit * fix(RGB <-> camera): renaming camera_encoder to rgb_encoder for clarity * chore(TODO): removing deprecated TODO * doc(write_u16_plane): improving docstrings for write_u16_plane * feat(units): adding constants for depth frames units (m and mm) * fix(spam): replacing spamming warning but a debug log * feat(leagcy metadata): adding automatic metadata update for legacy 'video.is_depth_map' feature * fix(copy&reindex): fixing metadat reshaping for single channel frames * fix(ImageNet): excluding dpeth frames from ImageNet stats * fix(PyAV container seek): fixing initial PyAV container seek to be robust againsy codec choice * feat(lerobot-dataset-viz): adding support for depth in lerobot-dataset-viz * fix(compress): removing rerun compression for DepthImages * fix(signle channel squeeze): fixing single channel squeezing * chore(format): format code * fix(streaming): adding support for dequantization in streaming_dataset.py * refactor(read depth): factorizing depth reading methods for realsense camera and adding support for depth-only usage * chore(renaming): fixing missed RGBEncoderConfig renamings * docs(renaming): reflecting renamings in a clearer way in the docs * chore(annotation): excluding depth from the annotation pipeline * feat(robots): 
adding depth support in compatible follower robots * feat(LeSadKiwi): excluding LeKiwi from depth support (for now) * chore(fail): removing misplaced file * chore(fail): removing misplaced file * fix(remove ffv1): removing ffv1 as it does not support MP4 * docs(cheat sheet): adding depth and video encoding to the cheat sheet * fix(lossless): tuning depth encoding parameters for lossless depth storage * test(fix): fixing failing tests * depth(ZMQ): excluding ZMQ from depth support * Revert "depth(ZMQ): excluding ZMQ from depth support" This reverts commit b95cf4e4c2bb1c188263bbcdcfbd6f3aea034ecb. * fix(image transforms): excluding depth frames from images transforms * fix(typo): typo * fix(stats): fixing stats computation for depth frames * fix(TIFF vs. pytorch): adding an extra uint16 to float32 conversion for depth maps stored as raw TIFF images * fix(typos): fixing typos * test(dtype): fixing stats computation typing tests --------- Signed-off-by: Steven Palma Co-authored-by: Wensi (Vince) Ai <59036629+wensi-ai@users.noreply.github.com> Co-authored-by: Steven Palma Co-authored-by: Wensi Ai --- docs/source/cameras.mdx | 8 + docs/source/cheat-sheet.mdx | 30 ++ docs/source/earthrover_mini_plus.mdx | 2 +- docs/source/groot.mdx | 2 +- docs/source/hope_jr.mdx | 4 +- docs/source/il_robots.mdx | 2 +- docs/source/lerobot-dataset-v3.mdx | 2 +- docs/source/reachy2.mdx | 4 +- docs/source/streaming_video_encoding.mdx | 40 +- docs/source/using_dataset_tools.mdx | 57 +- docs/source/video_encoding_parameters.mdx | 97 +++- .../annotations/steerable_pipeline/frames.py | 10 +- src/lerobot/async_inference/helpers.py | 5 +- src/lerobot/cameras/opencv/camera_opencv.py | 4 +- .../cameras/realsense/camera_realsense.py | 183 ++++--- .../realsense/configuration_realsense.py | 6 + src/lerobot/cameras/zmq/camera_zmq.py | 2 + src/lerobot/configs/__init__.py | 15 +- src/lerobot/configs/dataset.py | 8 +- src/lerobot/configs/default.py | 11 +- src/lerobot/configs/video.py | 159 ++++-- 
src/lerobot/datasets/compute_stats.py | 22 +- src/lerobot/datasets/dataset_metadata.py | 53 +- src/lerobot/datasets/dataset_reader.py | 30 +- src/lerobot/datasets/dataset_tools.py | 185 ++++--- src/lerobot/datasets/dataset_writer.py | 56 +- src/lerobot/datasets/depth_utils.py | 268 ++++++++++ src/lerobot/datasets/factory.py | 3 + src/lerobot/datasets/feature_utils.py | 2 +- src/lerobot/datasets/image_writer.py | 68 ++- src/lerobot/datasets/io_utils.py | 47 +- src/lerobot/datasets/lerobot_dataset.py | 58 ++- src/lerobot/datasets/pyav_utils.py | 51 +- src/lerobot/datasets/streaming_dataset.py | 47 +- src/lerobot/datasets/utils.py | 5 +- src/lerobot/datasets/video_utils.py | 239 ++++++--- src/lerobot/policies/utils.py | 3 +- src/lerobot/robots/hope_jr/hope_jr_arm.py | 26 +- src/lerobot/robots/hope_jr/hope_jr_hand.py | 26 +- .../robots/koch_follower/koch_follower.py | 26 +- src/lerobot/robots/lekiwi/lekiwi.py | 6 + src/lerobot/robots/lekiwi/lekiwi_client.py | 7 + .../robots/omx_follower/omx_follower.py | 26 +- .../openarm_follower/openarm_follower.py | 26 +- .../rebot_b601_follower.py | 26 +- src/lerobot/robots/so_follower/so_follower.py | 25 +- src/lerobot/robots/unitree_g1/unitree_g1.py | 16 +- src/lerobot/rollout/context.py | 6 +- src/lerobot/scripts/lerobot_dataset_viz.py | 44 +- src/lerobot/scripts/lerobot_edit_dataset.py | 53 +- src/lerobot/scripts/lerobot_record.py | 14 +- src/lerobot/scripts/lerobot_rollout.py | 6 +- src/lerobot/utils/feature_utils.py | 35 +- src/lerobot/utils/visualization_utils.py | 5 +- tests/annotations/test_frames.py | 5 +- tests/datasets/test_aggregate.py | 82 ++- tests/datasets/test_compute_stats.py | 39 +- tests/datasets/test_dataset_metadata.py | 49 +- tests/datasets/test_dataset_tools.py | 144 +++++- tests/datasets/test_dataset_writer.py | 14 +- tests/datasets/test_datasets.py | 4 + tests/datasets/test_depth.py | 247 +++++++++ tests/datasets/test_image_writer.py | 4 +- .../datasets/test_streaming_video_encoder.py | 45 +- 
tests/datasets/test_video_encoding.py | 487 +++++++++++++----- tests/fixtures/constants.py | 46 +- tests/fixtures/dataset_factories.py | 38 ++ tests/scripts/test_edit_dataset_parsing.py | 45 ++ tests/utils/test_visualization_utils.py | 9 +- 69 files changed, 2740 insertions(+), 679 deletions(-) create mode 100644 src/lerobot/datasets/depth_utils.py create mode 100644 tests/datasets/test_depth.py diff --git a/docs/source/cameras.mdx b/docs/source/cameras.mdx index 2dc2859dd..02714d591 100644 --- a/docs/source/cameras.mdx +++ b/docs/source/cameras.mdx @@ -157,6 +157,14 @@ finally: +### Working with depth + +The Intel RealSense and Reachy 2 cameras can capture both color and depth in lockstep. Calling `read()` returns the **color** frame as `(H, W, 3)` `uint8`. Calling `read_depth()` returns the **depth map** as `(H, W, 1)` `uint16`, where each pixel value is the distance from the sensor expressed in **millimetres**. A pixel value of `0` typically means "no measurement available" (out-of-range, occluded, or low-confidence). + +During recording, the control loop peeks the freshest buffered frames non-blockingly via `read_latest()` (color) and `read_latest_depth()` (depth), adding the depth map as a sibling feature (e.g. `front_depth` next to `front`). + +For how depth streams are stored and encoded when recording a dataset, see the [Depth streams](./video_encoding_parameters#depth-streams) section of the video encoding guide. + ## Use your phone's camera diff --git a/docs/source/cheat-sheet.mdx b/docs/source/cheat-sheet.mdx index a6afa14c2..45952c5b3 100644 --- a/docs/source/cheat-sheet.mdx +++ b/docs/source/cheat-sheet.mdx @@ -89,6 +89,36 @@ Control the data recording flow using keyboard shortcuts: - Press **Left Arrow (`←`)**: Delete current episode and retry. - Press **Escape (`ESC`)**: Stop, encode videos, and upload. +### Recording depth + +Intel RealSense cameras (`type: intelrealsense`) record a depth stream when you set `use_depth: true`. 
Depth is quantized to 12-bit codes and stored as its own video. + +```bash +lerobot-record \ + ... \ + --robot.cameras="{ head: {type: intelrealsense, serial_number_or_name: \"0123456789\", width: 640, height: 480, fps: 30, use_depth: true} }" \ + --dataset.repo_id=${HF_USER}/so101_depth_test \ + --dataset.single_task="put the red brick in a bowl" \ + --dataset.depth_encoder.depth_min=0.01 \ + --dataset.depth_encoder.depth_max=10.0 \ + --dataset.depth_encoder.shift=0.0 \ + --dataset.depth_encoder.use_log=true +``` + +### Video encoding parameters + +RGB and depth streams are encoded independently via the `--dataset.rgb_encoder.*` and `--dataset.depth_encoder.*` keys. + +```bash +lerobot-record \ + ... \ + --dataset.rgb_encoder.vcodec=h264 \ + --dataset.rgb_encoder.pix_fmt=yuv420p \ + --dataset.rgb_encoder.crf=23 \ + --dataset.depth_encoder.vcodec=hevc \ + --dataset.depth_encoder.extra_options='{"x265-params": "lossless=1"}' +``` + ### Training Depending on your hardware training the policy might take a few hours. 
That's how you train simple `ACT` policy: diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx index 508c0e3a9..f3b324093 100644 --- a/docs/source/earthrover_mini_plus.mdx +++ b/docs/source/earthrover_mini_plus.mdx @@ -194,7 +194,7 @@ lerobot-record \ --dataset.single_task="Navigate around obstacles" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx index a10b5e369..3ab202fb2 100644 --- a/docs/source/groot.mdx +++ b/docs/source/groot.mdx @@ -124,7 +124,7 @@ lerobot-rollout\ --dataset.single_task="Grab and handover the red cube to the other arm" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --policy.path=/groot-bimanual \ # your trained model --duration=600 ``` diff --git a/docs/source/hope_jr.mdx b/docs/source/hope_jr.mdx index 1f3b08fd7..c29a9f216 100644 --- a/docs/source/hope_jr.mdx +++ b/docs/source/hope_jr.mdx @@ -232,7 +232,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --display_data=true ``` @@ -278,6 +278,6 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model ``` diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index 6a820e0db..0f14bd133 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -207,7 +207,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # 
--dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx index 21cb232d3..0647af0b0 100644 --- a/docs/source/lerobot-dataset-v3.mdx +++ b/docs/source/lerobot-dataset-v3.mdx @@ -44,7 +44,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx index 4b08569db..7f975af43 100644 --- a/docs/source/reachy2.mdx +++ b/docs/source/reachy2.mdx @@ -161,7 +161,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --display_data=true ``` @@ -203,7 +203,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.camera_encoder.vcodec=auto \ + # --dataset.rgb_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/streaming_video_encoding.mdx b/docs/source/streaming_video_encoding.mdx index 96e049eb3..0be32b717 100644 --- a/docs/source/streaming_video_encoding.mdx +++ b/docs/source/streaming_video_encoding.mdx @@ -17,7 +17,7 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti | Parameter | CLI Flag | Type | Default | Description | | ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- | | `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture | -| `vcodec` | `--dataset.camera_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. 
`"auto"` detects best HW encoder | +| `vcodec` | `--dataset.rgb_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | | `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide | | `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `30` | Max buffered frames per camera (~1s at 30fps). Consumes RAM | @@ -82,15 +82,15 @@ Use HW encoding when: ### Available HW Encoders -| Encoder | Platform | Hardware | CLI Value | -| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | --------------------------------------------------- | -| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=h264_videotoolbox` | -| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=hevc_videotoolbox` | -| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=h264_nvenc` | -| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=hevc_nvenc` | -| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder.vcodec=h264_vaapi` | -| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder.vcodec=h264_qsv` | -| `auto` | Any | Probes the system for available HW encoders. 
Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder.vcodec=auto` | +| Encoder | Platform | Hardware | CLI Value | +| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------------------ | +| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.rgb_encoder.vcodec=h264_videotoolbox` | +| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.rgb_encoder.vcodec=hevc_videotoolbox` | +| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.rgb_encoder.vcodec=h264_nvenc` | +| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.rgb_encoder.vcodec=hevc_nvenc` | +| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.rgb_encoder.vcodec=h264_vaapi` | +| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.rgb_encoder.vcodec=h264_qsv` | +| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.rgb_encoder.vcodec=auto` | > [!NOTE] > In order to use the HW accelerated encoders you might need to upgrade your GPU drivers. @@ -100,15 +100,15 @@ Use HW encoding when: ## 5. Troubleshooting -| Symptom | Likely Cause | Fix | -| ------------------------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder.vcodec=auto`) | -| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder.vcodec=auto`). | -| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | -| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | -| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | -| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder.vcodec=auto` | -| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | +| Symptom | Likely Cause | Fix | +| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.rgb_encoder.vcodec=auto`) | +| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.rgb_encoder.vcodec=auto`). | +| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | +| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | +| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | +| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.rgb_encoder.vcodec=auto` | +| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | ## 6. Recommended Configurations @@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the # 2camsx 640x480x3 @30fps: Requires some tuning. # Use H.264, disable streaming, consider batching encoding -lerobot-record --dataset.camera_encoder.vcodec=h264 --dataset.streaming_encoding=false ... +lerobot-record --dataset.rgb_encoder.vcodec=h264 --dataset.streaming_encoding=false ... ``` ## 7. 
Closing note diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index 49247a6c1..e9299d298 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -11,8 +11,9 @@ LeRobot provides several utilities for manipulating datasets: 3. **Merge Datasets** - Combine multiple datasets into one. The datasets must have identical features, and episodes are concatenated in the order specified in `repo_ids` 4. **Add Features** - Add new features to a dataset 5. **Remove Features** - Remove features from a dataset -6. **Convert to Video** - Convert image-based datasets to video format for efficient storage -7. **Show the Info of Datasets** - Show the summary of datasets information such as number of episode etc. +6. **Convert to Video** - Convert image-based datasets to video format for efficient storage (RGB and depth cameras are encoded with separate encoders) +7. **Re-encode Videos** - Re-encode an existing video dataset's RGB and/or depth streams with new encoder settings +8. **Show the Info of Datasets** - Show a summary of dataset information, such as the number of episodes. The core implementation is in `lerobot.datasets.dataset_tools`. An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`. 
@@ -117,10 +118,19 @@ lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ --operation.type convert_image_to_video \ --operation.output_dir outputs/pusht_video \ - --operation.camera_encoder.vcodec libsvtav1 \ - --operation.camera_encoder.pix_fmt yuv420p \ - --operation.camera_encoder.g 2 \ - --operation.camera_encoder.crf 30 + --operation.rgb_encoder.vcodec libsvtav1 \ + --operation.rgb_encoder.pix_fmt yuv420p \ + --operation.rgb_encoder.g 2 \ + --operation.rgb_encoder.crf 30 + +# Convert a dataset that includes depth maps, customizing the depth encoder +lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --operation.type convert_image_to_video \ + --operation.output_dir outputs/pusht_video \ + --operation.depth_encoder.depth_min 0.01 \ + --operation.depth_encoder.depth_max 10.0 \ + --operation.depth_encoder.use_log true # Convert only specific episodes lerobot-edit-dataset \ @@ -147,11 +157,42 @@ lerobot-edit-dataset \ **Parameters:** - `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`) -- `camera_encoder`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder.. See [Video Encoding Parameters](./video_encoding_parameters) for more details. +- `rgb_encoder`: Video encoder settings applied to RGB cameras — all sub-fields accessible via `--operation.rgb_encoder.<field>`. See [Video Encoding Parameters](./video_encoding_parameters) for more details. +- `depth_encoder`: Video encoder settings applied to depth-map cameras (e.g. from an Intel RealSense). In addition to the standard encoder fields it exposes the depth quantization knobs (`depth_min`, `depth_max`, `shift`, `use_log`), accessible via `--operation.depth_encoder.<field>`. These quantization settings are persisted to the dataset metadata so depth can be dequantized back to physical units on load. See the [Depth streams](./video_encoding_parameters#depth-streams) section for details. 
- `episode_indices`: List of specific episodes to convert (default: all episodes) - `num_workers`: Number of parallel workers for processing (default: 4) -**Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). All episodes, stats, and tasks are preserved. +**Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). Depth-map cameras are detected automatically and routed to the `depth_encoder`, while RGB cameras use the `rgb_encoder`. All episodes, stats, and tasks are preserved. + +#### Re-encode Videos + +Re-encode the videos of an existing video dataset with different encoder settings, without going back to raw frames. RGB videos use the `rgb_encoder` and depth videos use the `depth_encoder`. Provide only the encoder(s) you want to re-encode; the other stream type is left untouched. + +```bash +# Re-encode all RGB videos with new settings (saves to lerobot/pusht_reencoded by default) +lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --operation.type reencode_videos \ + --operation.rgb_encoder.vcodec h264 \ + --operation.rgb_encoder.pix_fmt yuv420p \ + --operation.rgb_encoder.crf 23 + +# Re-encode both RGB and depth videos in a dataset with depth maps +lerobot-edit-dataset \ + --repo_id lerobot/pusht_depth \ + --operation.type reencode_videos \ + --operation.rgb_encoder.vcodec h264 \ + --operation.depth_encoder.crf 50 +``` + +**Parameters:** + +- `rgb_encoder`: Encoder settings applied to every RGB video. Omit to skip re-encoding RGB videos. +- `depth_encoder`: Encoder settings applied to every depth video. Omit to skip re-encoding depth videos. +- `num_workers`: Number of parallel workers for processing. 
+ +> [!NOTE] +> When re-encoding depth videos, the existing depth quantization parameters (`depth_min`, `depth_max`, `shift`, `use_log`) and the `is_depth_map` flag are **preserved** — re-encoding only changes the codec/quality of the stored stream, not how depth is dequantized on load. ### Show the information of datasets diff --git a/docs/source/video_encoding_parameters.mdx b/docs/source/video_encoding_parameters.mdx index 0b5b99b2b..132d25056 100644 --- a/docs/source/video_encoding_parameters.mdx +++ b/docs/source/video_encoding_parameters.mdx @@ -2,15 +2,15 @@ When video storage is enabled, LeRobot stores each camera stream as an **MP4** file instead of saving one image file per timestep. Video encoding compresses across time, which usually cuts dataset size and I/O compared to a pile of PNG, while keeping MP4 — a format every player and loader understands. -Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `camera_encoder`, a nested `VideoEncoderConfig` (`lerobot.configs.video.VideoEncoderConfig`) passed through PyAV. +Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `rgb_encoder`, a nested `RGBEncoderConfig` (`lerobot.configs.video.RGBEncoderConfig`) passed through PyAV. -You can set these parameters from the CLI with `--dataset.camera_encoder.` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every camera video stream in that run. +You can set these parameters from the CLI with `--dataset.rgb_encoder.<field>` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every RGB camera video stream in that run; depth streams are configured separately through `depth_encoder`. 
- Video storage must be on for `camera_encoder` to have any effect — + Video storage must be on for `rgb_encoder` to have any effect — `use_videos=True` in Python APIs, or `--dataset.video=true` on the CLI (the - recording default). With video off, inputs stay as images and `camera_encoder` - is ignored. + recording default). With video off, inputs stay as images and `rgb_encoder` is + ignored. For details on **when** frames are written vs. encoded (streaming vs. post-episode), queues, and other top-level `--dataset.*` switches, see [Streaming Video Encoding](./streaming_video_encoding). For an encoding-parameter comparison and experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark). @@ -33,9 +33,9 @@ lerobot-record \ --dataset.single_task="Grab the cube" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - --dataset.camera_encoder.vcodec=h264 \ - --dataset.camera_encoder.preset=fast \ - --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \ + --dataset.rgb_encoder.vcodec=h264 \ + --dataset.rgb_encoder.preset=fast \ + --dataset.rgb_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \ --display_data=true ``` @@ -50,7 +50,7 @@ Only override these parameters if you have a specific reason to, and measure the -All flags below are prefixed with `--dataset.camera_encoder.` on the CLI. +All flags below are prefixed with `--dataset.rgb_encoder.` on the CLI. | Parameter | Type | Default | Description | | --------------- | ---------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -65,6 +65,77 @@ All flags below are prefixed with `--dataset.camera_encoder.` on the CLI. --- +## Depth streams + +Depth maps (Intel RealSense, Reachy 2) are stored as their **own video streams** alongside the RGB streams. 
Raw depth (`uint16` millimetres or `float32` metres) can't survive an 8-bit codec, so LeRobot **quantizes** each map to a 12-bit code (`[0, 4095]`) — logarithmically by default, to match the `1/depth` error profile of depth sensors — then packs it into a high-bit-depth pixel format (`gray12le`) and encodes it with a 12-bit codec. + +```mermaid +flowchart LR + A["Raw depth (uint16 mm / float32 m)"] --> B["Clip to depth_min, depth_max"] + B --> C["Quantize to 12-bit code 0–4095 (log or linear)"] + C --> D["Pack into gray12le"] + D --> E["Encode video (hevc Main 12)"] + E --> F[("MP4 + metadata: depth_min/max, shift, use_log")] + F -. "load time (depth_output_unit)" .-> G["Dequantize to mm or m"] + + classDef input fill:#e3f2fd,stroke:#1565c0,color:#0d47a1; + classDef encode fill:#ede7f6,stroke:#5e35b1,color:#311b92; + classDef store fill:#fff8e1,stroke:#f9a825,color:#e65100; + classDef load fill:#e8f5e9,stroke:#2e7d32,color:#1b5e20; + + class A input; + class B,C,D,E encode; + class F store; + class G load; +``` + +Configure the depth pipeline through a parallel **`depth_encoder`** block (`DepthEncoderConfig`). It shares every `RGBEncoderConfig` field (`vcodec`, `pix_fmt`, `crf`, …) and adds four quantizer knobs, set via `--dataset.depth_encoder.`: + +```bash +lerobot-record \ + ... \ + --dataset.depth_encoder.vcodec=hevc \ + --dataset.depth_encoder.depth_min=0.05 \ + --dataset.depth_encoder.depth_max=5.0 \ + --dataset.depth_encoder.use_log=true +``` + +| Parameter | Type | Default | Description | +| --------------- | ------- | ------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `vcodec` | `str` | `"hevc"` | HEVC Main 12 (a 12-bit-capable codec, MP4-compatible). | +| `extra_options` | `dict` | `{"x265-params": "lossless=1"}` | **Depth defaults to lossless** (exact round-trip); `crf` is ignored. 
Pass `extra_options={}` and set `crf` for a smaller lossy stream. | +| `pix_fmt` | `str` | `"gray12le"` | Single-channel 12-bit pixel format used to carry the quantized codes. | +| `depth_min` | `float` | `0.01` | Depth in metres mapped to quantum `0`. Values below are clipped to `depth_min` before quantization. | +| `depth_max` | `float` | `10.0` | Depth in metres mapped to quantum `4095`. Values above are clipped to `depth_max` before quantization. | +| `shift` | `float` | `3.5` | Pre-log offset (metres) used in logarithmic quantization for numerical stability near zero. Must satisfy `depth_min + shift > 0`. | +| `use_log` | `bool` | `True` | If `true`, quantize in log-space (recommended for typical depth sensors). Set to `false` for uniform/linear quantization. | + +> [!TIP] +> `depth_min`, `depth_max`, and `shift` are always interpreted in **metres**, regardless of the input depth's unit. Inputs are auto-detected: integer arrays (e.g. `uint16` millimetres straight from a RealSense) are treated as millimetres, floating arrays as metres. +> Pick `depth_min` / `depth_max` to bracket the actual working range of your sensor — depth values outside that range saturate, which can crush detail at the boundaries. + +Depth features are flagged with `"is_depth_map": true` in `meta/info.json`, and their quantizer settings (`video.depth_min`, `video.depth_max`, `video.shift`, `video.use_log`) are persisted — which is what lets depth be **dequantized back to physical units** on load. + +### Output unit at load time + +`depth_encoder` is a **record-time** concern. The unit that depth maps are dequantized to on _load_ (e.g.
during training) is set separately by the read-time flag `--dataset.depth_output_unit`: + +```bash +lerobot-train \ + --dataset.repo_id=/ \ + --dataset.depth_output_unit=m \ + --policy.type=act +``` + +| Parameter | Type | Default | Description | +| ------------------- | ----- | ------- | -------------------------------------------------------------------------------------------- | +| `depth_output_unit` | `str` | `"mm"` | Physical unit depth maps are dequantized to on load: `"mm"` (millimetres) or `"m"` (metres). | + +> [!TIP] +> This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras. + +--- + ## Persistence in dataset metadata After the first episode of a video stream is encoded, the encoder configuration is **persisted into the dataset metadata** (`meta/info.json`) under each video feature, alongside the values probed from the file itself. For a video feature `observation.images.`, the layout in `info.json` is: @@ -82,7 +153,7 @@ After the first episode of a video stream is encoded, the encoder configuration "video.pix_fmt": "yuv420p", "video.fps": 30, "video.channels": 3, - "video.is_depth_map": false, + "is_depth_map": false, "video.g": 2, "video.crf": 30, "video.preset": "fast", @@ -97,12 +168,12 @@ After the first episode of a video stream is encoded, the encoder configuration Two sources contribute to the `info` block: -- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present. -- **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`. 
+- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `is_depth_map`, plus `audio.*` if an audio stream is present. +- **Encoder-derived** (taken from `RGBEncoderConfig` or `DepthEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`, plus `video.depth_min`, `video.depth_max`, `video.shift`, `video.use_log` for depth streams. This block is populated **once**, from the **first** episode. It assumes every - episode in the dataset was encoded with the same `camera_encoder`. Changing + episode in the dataset was encoded with the same `rgb_encoder` / `depth_encoder`. Changing encoder settings partway through a recording is not supported — the `info.json` will only reflect the parameters used for the first episode. diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py index a6c904673..5a6a5879c 100644 --- a/src/lerobot/annotations/steerable_pipeline/frames.py +++ b/src/lerobot/annotations/steerable_pipeline/frames.py @@ -36,7 +36,7 @@ from typing import Any, Protocol import PIL.Image import torch -from lerobot.configs.video import VideoEncoderConfig +from lerobot.configs import RGBEncoderConfig from lerobot.datasets.video_utils import decode_video_frames, reencode_video from .reader import EpisodeRecord, snap_to_frame @@ -164,7 +164,9 @@ class VideoFrameProvider: # only for video-stored cameras. Image-stored cameras (also in # ``camera_keys``) would KeyError, so restrict the list — and the # default — to video keys. - keys = list(self._meta.video_keys) + # Depth cameras are excluded from the annotation pipeline for now. + depth_keys = set(self._meta.depth_keys) + keys = [key for key in self._meta.video_keys if key not in depth_keys] # Last-resort fallback: if metadata didn't surface any video keys but # the caller explicitly named a camera (``--vlm.camera_key=...``), # trust them — the key is by definition known to exist on the dataset.
@@ -276,12 +278,12 @@ class VideoFrameProvider: from_timestamp = float(ep[f"videos/{self.camera_key}/from_timestamp"]) to_timestamp = float(ep[f"videos/{self.camera_key}/to_timestamp"]) src = self.root / self._meta.get_video_file_path(record.episode_index, self.camera_key) - encoder = VideoEncoderConfig(vcodec="h264", pix_fmt="yuv420p", g=None, crf=23, preset="ultrafast") + encoder = RGBEncoderConfig(vcodec="h264", pix_fmt="yuv420p", g=None, crf=23, preset="ultrafast") try: reencode_video( src, out_path, - camera_encoder=encoder, + video_encoder=encoder, overwrite=True, start_time_s=from_timestamp, end_time_s=to_timestamp, diff --git a/src/lerobot/async_inference/helpers.py b/src/lerobot/async_inference/helpers.py index 4931c68c5..54f0ca69f 100644 --- a/src/lerobot/async_inference/helpers.py +++ b/src/lerobot/async_inference/helpers.py @@ -105,8 +105,9 @@ def raw_observation_to_observation( def prepare_image(image: torch.Tensor) -> torch.Tensor: - """Minimal preprocessing to turn int8 images to float32 in [0, 1], and create a memory-contiguous tensor""" - image = image.type(torch.float32) / 255 + """Minimal preprocessing to turn RGB uint8 images to float32 in [0, 1], and create a memory-contiguous tensor""" + if image.dtype == torch.uint8: + image = image.type(torch.float32) / 255 image = image.contiguous() return image diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index b3c20e8dd..e50d24c01 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -436,7 +436,7 @@ class OpenCVCamera(Camera): Internal loop run by the background thread for asynchronous reading. On each iteration: - 1. Reads a color frame + 1. Reads a color frame (blocking call) 2. Stores result in latest_frame and updates timestamp (thread-safe) 3. 
Sets new_frame_event to notify listeners @@ -485,6 +485,8 @@ class OpenCVCamera(Camera): if self.thread is not None and self.thread.is_alive(): self.thread.join(timeout=2.0) + if self.thread.is_alive(): + logger.warning(f"{self} read thread did not terminate within timeout.") self.thread = None self.stop_event = None diff --git a/src/lerobot/cameras/realsense/camera_realsense.py b/src/lerobot/cameras/realsense/camera_realsense.py index 80008e9f9..29cb1e5e0 100644 --- a/src/lerobot/cameras/realsense/camera_realsense.py +++ b/src/lerobot/cameras/realsense/camera_realsense.py @@ -128,6 +128,7 @@ class RealSenseCamera(Camera): self.fps = config.fps self.color_mode = config.color_mode + self.use_rgb = config.use_rgb self.use_depth = config.use_depth self.warmup_s = config.warmup_s @@ -195,12 +196,15 @@ class RealSenseCamera(Camera): # NOTE(Steven/Caroline): Enforcing at least one second of warmup as RS cameras need a bit of time before the first read. If we don't wait, the first read from the warmup will raise. self.warmup_s = max(self.warmup_s, 1) + warmup_read = self.async_read if self.use_rgb else self.async_read_depth start_time = time.time() while time.time() - start_time < self.warmup_s: - self.async_read(timeout_ms=self.warmup_s * 1000) + warmup_read(timeout_ms=self.warmup_s * 1000) time.sleep(0.1) with self.frame_lock: - if self.latest_color_frame is None or self.use_depth and self.latest_depth_frame is None: + if (self.use_rgb and self.latest_color_frame is None) or ( + self.use_depth and self.latest_depth_frame is None + ): raise ConnectionError(f"{self} failed to capture frames during warmup.") logger.info(f"{self} connected.") @@ -268,13 +272,13 @@ class RealSenseCamera(Camera): ) if len(found_devices) > 1: - serial_numbers = [dev["serial_number"] for dev in found_devices] + serial_numbers = [dev["id"] for dev in found_devices] raise ValueError( f"Multiple RealSense cameras found with name '{name}'. " f"Please use a unique serial number instead. 
Found SNs: {serial_numbers}" ) - serial_number = str(found_devices[0]["serial_number"]) + serial_number = str(found_devices[0]["id"]) return serial_number def _configure_rs_pipeline_config(self, rs_config: Any) -> None: @@ -282,15 +286,17 @@ class RealSenseCamera(Camera): rs.config.enable_device(rs_config, self.serial_number) if self.width and self.height and self.fps: - rs_config.enable_stream( - rs.stream.color, self.capture_width, self.capture_height, rs.format.rgb8, self.fps - ) + if self.use_rgb: + rs_config.enable_stream( + rs.stream.color, self.capture_width, self.capture_height, rs.format.rgb8, self.fps + ) if self.use_depth: rs_config.enable_stream( rs.stream.depth, self.capture_width, self.capture_height, rs.format.z16, self.fps ) else: - rs_config.enable_stream(rs.stream.color) + if self.use_rgb: + rs_config.enable_stream(rs.stream.color) if self.use_depth: rs_config.enable_stream(rs.stream.depth) @@ -298,8 +304,9 @@ class RealSenseCamera(Camera): def _configure_capture_settings(self) -> None: """Sets fps, width, and height from device stream if not already configured. - Uses the color stream profile to update unset attributes. Handles rotation by - swapping width/height when needed. Original capture dimensions are always stored. + Uses the color stream profile (or the depth stream profile when the color + stream is disabled) to update unset attributes. Handles rotation by swapping + width/height when needed. Original capture dimensions are always stored. Raises: DeviceNotConnectedError: If device is not connected. 
@@ -308,7 +315,8 @@ class RealSenseCamera(Camera): if self.rs_profile is None: raise RuntimeError(f"{self}: rs_profile must be initialized before use.") - stream = self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile() + rs_stream = rs.stream.color if self.use_rgb else rs.stream.depth + stream = self.rs_profile.get_stream(rs_stream).as_video_stream_profile() if self.fps is None: self.fps = stream.fps() @@ -323,6 +331,14 @@ class RealSenseCamera(Camera): self.width, self.height = actual_width, actual_height self.capture_width, self.capture_height = actual_width, actual_height + def _read(self, read_depth: bool = False) -> NDArray[Any]: + """Shared helper for :meth:`read`/:meth:`read_depth`: wait for a fresh color or depth frame.""" + if self.thread is None or not self.thread.is_alive(): + raise RuntimeError(f"{self} read thread is not running.") + + self.new_frame_event.clear() + return self._async_read(timeout_ms=10000, read_depth=read_depth) + @check_if_not_connected def read_depth(self, timeout_ms: int = 200) -> NDArray[Any]: """ @@ -332,8 +348,8 @@ class RealSenseCamera(Camera): from the camera hardware via the RealSense pipeline. Returns: - np.ndarray: The depth map as a NumPy array (height, width) - of type `np.uint16` (raw depth values in millimeters) and rotation. + np.ndarray: The depth map as a NumPy array (height, width, 1) + of type `np.uint16` (raw depth values in millimeters). Raises: DeviceNotConnectedError: If the camera is not connected. @@ -349,20 +365,7 @@ class RealSenseCamera(Camera): f"Failed to capture depth frame '.read_depth()'. Depth stream is not enabled for {self}." ) - if self.thread is None or not self.thread.is_alive(): - raise RuntimeError(f"{self} read thread is not running.") - - self.new_frame_event.clear() - - _ = self.async_read(timeout_ms=10000) - - with self.frame_lock: - depth_map = self.latest_depth_frame - - if depth_map is None: - raise RuntimeError("No depth frame available. 
Ensure camera is streaming.") - - return depth_map + return self._read(read_depth=True) def _read_from_hardware(self): if self.rs_pipeline is None: @@ -405,12 +408,10 @@ class RealSenseCamera(Camera): f"{self} read() timeout_ms parameter is deprecated and will be removed in future versions." ) - if self.thread is None or not self.thread.is_alive(): - raise RuntimeError(f"{self} read thread is not running.") + if not self.use_rgb: + raise RuntimeError(f"{self}: cannot read color — camera was configured with use_rgb=False.") - self.new_frame_event.clear() - - frame = self.async_read(timeout_ms=10000) + frame = self._read() read_duration_ms = (time.perf_counter() - start_time) * 1e3 logger.debug(f"{self} read took: {read_duration_ms:.1f}ms") @@ -465,8 +466,8 @@ class RealSenseCamera(Camera): Internal loop run by the background thread for asynchronous reading. On each iteration: - 1. Reads a color frame with 500ms timeout - 2. Stores result in latest_frame and updates timestamp (thread-safe) + 1. Reads a color/depth frame (blocking call with 10s timeout) + 2. Stores result in latest_color_frame/latest_depth_frame and updates timestamp (thread-safe) 3. Sets new_frame_event to notify listeners Stops on DeviceNotConnectedError, logs other errors and continues. 
@@ -479,19 +480,24 @@ class RealSenseCamera(Camera): while not stop_event.is_set(): try: frame = self._read_from_hardware() - color_frame_raw = frame.get_color_frame() - color_frame = np.asanyarray(color_frame_raw.get_data()) - processed_color_frame = self._postprocess_image(color_frame) + + if self.use_rgb: + color_frame_raw = frame.get_color_frame() + color_frame = np.asanyarray(color_frame_raw.get_data()) + processed_color_frame = self._postprocess_image(color_frame) if self.use_depth: depth_frame_raw = frame.get_depth_frame() depth_frame = np.asanyarray(depth_frame_raw.get_data()) processed_depth_frame = self._postprocess_image(depth_frame, depth_frame=True) + if processed_depth_frame.ndim == 2: # (H, W) -> (H, W, 1) + processed_depth_frame = processed_depth_frame[..., np.newaxis] capture_time = time.perf_counter() with self.frame_lock: - self.latest_color_frame = processed_color_frame + if self.use_rgb: + self.latest_color_frame = processed_color_frame if self.use_depth: self.latest_depth_frame = processed_depth_frame self.latest_timestamp = capture_time @@ -523,6 +529,8 @@ class RealSenseCamera(Camera): if self.thread is not None and self.thread.is_alive(): self.thread.join(timeout=2.0) + if self.thread.is_alive(): # pragma: no cover + logger.warning(f"{self} read thread did not terminate within timeout.") self.thread = None self.stop_event = None @@ -533,7 +541,26 @@ class RealSenseCamera(Camera): self.latest_timestamp = None self.new_frame_event.clear() - # NOTE(Steven): Missing implementation for depth for now + def _async_read(self, timeout_ms: float, read_depth: bool = False) -> NDArray[Any]: + """Shared helper for :meth:`async_read`/:meth:`async_read_depth`: return the latest buffered frame.""" + if self.thread is None or not self.thread.is_alive(): + raise RuntimeError(f"{self} read thread is not running.") + + if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0): + raise TimeoutError( + f"Timed out waiting for frame from camera {self} after 
{timeout_ms} ms. " + f"Read thread alive: {self.thread.is_alive()}." + ) + + with self.frame_lock: + frame = self.latest_depth_frame if read_depth else self.latest_color_frame + self.new_frame_event.clear() + + if frame is None: + raise RuntimeError(f"Internal error: Event set but no frame available for {self}.") + + return frame + @check_if_not_connected def async_read(self, timeout_ms: float = 200) -> NDArray[Any]: """ @@ -558,25 +585,31 @@ class RealSenseCamera(Camera): RuntimeError: If the background thread died unexpectedly or another error occurs. """ + if not self.use_rgb: + raise RuntimeError(f"{self}: cannot read color — camera was configured with use_rgb=False.") + + return self._async_read(timeout_ms=timeout_ms) + + def _read_latest(self, max_age_ms: int, read_depth: bool = False) -> NDArray[Any]: + """Shared helper for :meth:`read_latest`/:meth:`read_latest_depth`: peek the latest buffered frame.""" if self.thread is None or not self.thread.is_alive(): raise RuntimeError(f"{self} read thread is not running.") - if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0): - raise TimeoutError( - f"Timed out waiting for frame from camera {self} after {timeout_ms} ms. " - f"Read thread alive: {self.thread.is_alive()}." - ) - with self.frame_lock: - frame = self.latest_color_frame - self.new_frame_event.clear() + frame = self.latest_depth_frame if read_depth else self.latest_color_frame + timestamp = self.latest_timestamp - if frame is None: - raise RuntimeError(f"Internal error: Event set but no frame available for {self}.") + if frame is None or timestamp is None: + raise RuntimeError(f"{self} has not captured any frames yet.") + + age_ms = (time.perf_counter() - timestamp) * 1e3 + if age_ms > max_age_ms: + raise TimeoutError( + f"{self} latest frame is too old: {age_ms:.1f} ms (max allowed: {max_age_ms} ms)." 
+ ) return frame - # NOTE(Steven): Missing implementation for depth for now @check_if_not_connected def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]: """Return the most recent (color) frame captured immediately (Peeking). @@ -593,24 +626,48 @@ class RealSenseCamera(Camera): DeviceNotConnectedError: If the camera is not connected. RuntimeError: If the camera is connected but has not captured any frames yet. """ + if not self.use_rgb: + raise RuntimeError(f"{self}: cannot read color — camera was configured with use_rgb=False.") - if self.thread is None or not self.thread.is_alive(): - raise RuntimeError(f"{self} read thread is not running.") + return self._read_latest(max_age_ms=max_age_ms) - with self.frame_lock: - frame = self.latest_color_frame - timestamp = self.latest_timestamp + @check_if_not_connected + def async_read_depth(self, timeout_ms: float = 200) -> NDArray[np.uint16]: + """Read the latest depth frame asynchronously, in millimeters. - if frame is None or timestamp is None: - raise RuntimeError(f"{self} has not captured any frames yet.") + Mirrors :meth:`async_read` but returns the depth stream rather than the + color stream. Output is ``np.uint16`` of shape ``(H, W, 1)``, where each + pixel is the distance from the sensor in millimeters. - age_ms = (time.perf_counter() - timestamp) * 1e3 - if age_ms > max_age_ms: - raise TimeoutError( - f"{self} latest frame is too old: {age_ms:.1f} ms (max allowed: {max_age_ms} ms)." - ) + Raises: + DeviceNotConnectedError: If the camera is not connected. + RuntimeError: If ``use_depth`` is ``False`` for this camera, or if + the background read thread is not running. + TimeoutError: If no frame becomes available within ``timeout_ms``. 
+ """ + if not self.use_depth: + raise RuntimeError(f"{self}: cannot read depth — camera was configured with use_depth=False.") - return frame + return self._async_read(timeout_ms=timeout_ms, read_depth=True) + + @check_if_not_connected + def read_latest_depth(self, max_age_ms: int = 500) -> NDArray[Any]: + """Return the most recent depth frame in millimeters (peeking). + + Non-blocking counterpart of :meth:`read_latest` for the depth stream. + Output is ``np.uint16`` of shape ``(H, W, 1)``, where each pixel is the + distance from the sensor in millimeters. + + Raises: + DeviceNotConnectedError: If the camera is not connected. + RuntimeError: If ``use_depth`` is ``False`` for this camera, or if + no depth frame has been captured yet. + TimeoutError: If the latest depth frame is older than ``max_age_ms``. + """ + if not self.use_depth: + raise RuntimeError(f"{self}: cannot read depth — camera was configured with use_depth=False.") + + return self._read_latest(max_age_ms=max_age_ms, read_depth=True) def disconnect(self) -> None: """ diff --git a/src/lerobot/cameras/realsense/configuration_realsense.py b/src/lerobot/cameras/realsense/configuration_realsense.py index 71b083b00..018675195 100644 --- a/src/lerobot/cameras/realsense/configuration_realsense.py +++ b/src/lerobot/cameras/realsense/configuration_realsense.py @@ -42,12 +42,14 @@ class RealSenseCameraConfig(CameraConfig): height: Requested frame height in pixels for the color stream. serial_number_or_name: Unique serial number or human-readable name to identify the camera. color_mode: Color mode for image output (RGB or BGR). Defaults to RGB. + use_rgb: Whether to enable the color stream. Defaults to True. use_depth: Whether to enable depth stream. Defaults to False. rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation. warmup_s: Time reading frames before returning from connect (in seconds) Note: - Either name or serial_number must be specified. 
+ - At least one of `use_rgb` or `use_depth` must be enabled. - Depth stream configuration (if enabled) will use the same FPS as the color stream. - The actual resolution and FPS may be adjusted by the camera to the nearest supported mode. - For `fps`, `width` and `height`, either all of them need to be set, or none of them. @@ -55,6 +57,7 @@ class RealSenseCameraConfig(CameraConfig): serial_number_or_name: str color_mode: ColorMode = ColorMode.RGB + use_rgb: bool = True use_depth: bool = False rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION warmup_s: int = 1 @@ -63,6 +66,9 @@ class RealSenseCameraConfig(CameraConfig): self.color_mode = ColorMode(self.color_mode) self.rotation = Cv2Rotation(self.rotation) + if not self.use_rgb and not self.use_depth: + raise ValueError("At least one of `use_rgb` or `use_depth` must be enabled.") + values = (self.fps, self.width, self.height) if any(v is not None for v in values) and any(v is None for v in values): raise ValueError( diff --git a/src/lerobot/cameras/zmq/camera_zmq.py b/src/lerobot/cameras/zmq/camera_zmq.py index f3df17814..cd32a117b 100644 --- a/src/lerobot/cameras/zmq/camera_zmq.py +++ b/src/lerobot/cameras/zmq/camera_zmq.py @@ -293,6 +293,8 @@ class ZMQCamera(Camera): if self.thread is not None and self.thread.is_alive(): self.thread.join(timeout=2.0) + if self.thread.is_alive(): + logger.warning(f"{self} read thread did not terminate within timeout.") self.thread = None self.stop_event = None diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py index be4491811..fa5942129 100644 --- a/src/lerobot/configs/__init__.py +++ b/src/lerobot/configs/__init__.py @@ -33,10 +33,15 @@ from .types import ( RTCAttentionSchedule, ) from .video import ( + DEFAULT_DEPTH_UNIT, VALID_VIDEO_CODECS, VIDEO_ENCODER_INFO_KEYS, + DepthEncoderConfig, + RGBEncoderConfig, VideoEncoderConfig, - camera_encoder_defaults, + depth_encoder_defaults, + encoder_config_from_video_info, + rgb_encoder_defaults, ) __all__ = [ @@ 
-57,9 +62,15 @@ __all__ = [ "WandBConfig", "load_recipe", "VideoEncoderConfig", + "RGBEncoderConfig", + "DepthEncoderConfig", # Defaults - "camera_encoder_defaults", + "rgb_encoder_defaults", + "depth_encoder_defaults", + # Factories + "encoder_config_from_video_info", # Constants + "DEFAULT_DEPTH_UNIT", "VALID_VIDEO_CODECS", "VIDEO_ENCODER_INFO_KEYS", ] diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py index c40c0fae2..7d30ca038 100644 --- a/src/lerobot/configs/dataset.py +++ b/src/lerobot/configs/dataset.py @@ -18,7 +18,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from .video import VideoEncoderConfig, camera_encoder_defaults +from .video import DepthEncoderConfig, RGBEncoderConfig, depth_encoder_defaults, rgb_encoder_defaults @dataclass @@ -58,8 +58,10 @@ class DatasetRecordConfig: # Set to 1 for immediate encoding (default behavior), or higher for batched encoding video_encoding_batch_size: int = 1 # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys, - # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``). - camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + # e.g. ``--dataset.rgb_encoder.vcodec=h264`` (see ``RGBEncoderConfig``). + rgb_encoder: RGBEncoderConfig = field(default_factory=rgb_encoder_defaults) + # Video encoder settings for depth-map MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys. + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) # Enable streaming video encoding: encode frames in real-time during capture instead # of writing PNG images first. Makes save_episode() near-instant. 
More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding streaming_encoding: bool = False diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index 9b5433005..4f24b9dac 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -19,6 +19,8 @@ from dataclasses import dataclass, field from lerobot.transforms import ImageTransformsConfig from lerobot.utils.import_utils import get_safe_default_video_backend +from .video import DEFAULT_DEPTH_UNIT, DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT + @dataclass class DatasetConfig: @@ -35,14 +37,21 @@ class DatasetConfig: revision: str | None = None use_imagenet_stats: bool = True video_backend: str = field(default_factory=get_safe_default_video_backend) - # When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0). + # When True, RGB video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0). # This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion. return_uint8: bool = False + # Physical unit depth maps are dequantized to at load time: "mm" (millimeters) or "m" (meters). + # Has no effect on datasets without depth cameras. + depth_output_unit: str = DEFAULT_DEPTH_UNIT streaming: bool = False # Fraction of episodes held out per task for offline evaluation (0.0 = disabled).
eval_split: float = 0.0 def __post_init__(self) -> None: + if self.depth_output_unit not in (DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT): + raise ValueError( + f"depth_output_unit must be '{DEPTH_METER_UNIT}' or '{DEPTH_MILLIMETER_UNIT}', got {self.depth_output_unit!r}" + ) if not (0.0 <= self.eval_split < 1.0): raise ValueError(f"eval_split must be in [0.0, 1.0), got {self.eval_split}") if self.episodes is not None: diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py index bf2471453..3ea834508 100644 --- a/src/lerobot/configs/video.py +++ b/src/lerobot/configs/video.py @@ -20,7 +20,7 @@ from __future__ import annotations import logging from dataclasses import dataclass, field -from typing import Any +from typing import Any, ClassVar, Self from lerobot.utils.import_utils import require_package @@ -40,7 +40,6 @@ VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "au # Aliases for legacy video codec names. VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"} - LIBSVTAV1_DEFAULT_PRESET: int = 12 # Keys persisted under ``features[*]["info"]`` as ``video.`` (from :class:`VideoEncoderConfig`). @@ -52,40 +51,45 @@ VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset( f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES ) +# Default depth quantization and encoding parameters. +DEPTH_QUANT_BITS: int = 12 +DEPTH_QMAX: int = (1 << DEPTH_QUANT_BITS) - 1 # 4095 + +DEFAULT_DEPTH_MIN: float = 0.01 +DEFAULT_DEPTH_MAX: float = 10.0 +DEFAULT_DEPTH_SHIFT: float = 3.5 +DEFAULT_DEPTH_USE_LOG: bool = True +DEFAULT_DEPTH_PIX_FMT: str = "gray12le" + +DEPTH_METER_UNIT: str = "m" +DEPTH_MILLIMETER_UNIT: str = "mm" +DEFAULT_DEPTH_UNIT: str = DEPTH_MILLIMETER_UNIT + +# Depth-specific tuning fields persisted under ``features[*]["info"]`` as ``video.``. +DEPTH_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset({"depth_min", "depth_max", "shift", "use_log"}) + @dataclass class VideoEncoderConfig: - """Video encoder configuration. 
+ """Video encoder configuration.""" - Attributes: - vcodec: Video encoder name. ``"auto"`` is resolved during - construction (HW encoder if available, else ``libsvtav1``). - pix_fmt: Pixel format (e.g. ``"yuv420p"``). - g: GOP size (keyframe interval). - crf: Quality level — mapped to the native quality parameter of the - codec (``crf`` for software, ``qp`` for NVENC/VAAPI, - ``q:v`` for VideoToolbox, ``global_quality`` for QSV). - preset: Speed/quality preset. Accepted type is per-codec. - fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2) - embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values - set ``tune=fastdecode``. Ignored for other codecs. - video_backend: Python to be used for encoding. Only ``"pyav"`` - is currently supported. - extra_options: Free-form dictionary of additional video encoder options - (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). - """ - - vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ? - pix_fmt: str = "yuv420p" - g: int | None = 2 - crf: int | float | None = 30 - preset: int | str | None = None - fast_decode: int = 0 + vcodec: str = "libsvtav1" # Video codec name. "auto" picks a hardware codec if available, else libsvtav1. + pix_fmt: str = "yuv420p" # Pixel format (e.g. yuv420p). + g: int | None = 2 # GOP size (keyframe interval). + crf: int | float | None = 30 # Quality level. Lower means better quality and larger files. + preset: int | str | None = None # Speed/quality preset. Accepted values are codec-specific. + fast_decode: int = 0 # Fast-decode tuning. Accepted values are codec-specific, 0 disables it. # TODO(CarolinePascal): add torchcodec support + find a way to unify the # two backends (encoding and decoding). - video_backend: str = "pyav" + video_backend: str = "pyav" # Encoding backend. Only "pyav" is currently supported. + # Extra codec options merged last, e.g. {"tune": "film"}. 
extra_options: dict[str, Any] = field(default_factory=dict) + # Source-data channel count this encoder is expected to handle. ``None`` + # disables the pix_fmt channel-count check; concrete subclasses set it + # (3 for RGB, 1 for depth, etc.). + _DEFAULT_CHANNELS: ClassVar[int | None] = None + def __post_init__(self) -> None: self.resolve_vcodec() # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work". @@ -94,9 +98,9 @@ class VideoEncoderConfig: self.validate() @classmethod - def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig: - """Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block. - Missing or ``None`` values fall back to the class defaults. + def _kwargs_from_video_info(cls, video_info: dict | None) -> dict[str, Any]: + """Parse the ``video.*`` keys of a feature ``info`` block into + constructor kwargs. """ video_info = video_info or {} kwargs: dict[str, Any] = {} @@ -115,7 +119,15 @@ class VideoEncoderConfig: continue kwargs[field_name] = value - return cls(**kwargs) + return kwargs + + @classmethod + def from_video_info(cls, video_info: dict | None) -> Self: + """Reconstruct an encoder config from a video feature's ``info`` block. + + Missing or ``None`` values fall back to the class defaults. + """ + return cls(**cls._kwargs_from_video_info(video_info)) def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: """Return the subset of available encoders based on the specified video backend. @@ -138,7 +150,9 @@ class VideoEncoderConfig: require_package("av", extra="dataset") from lerobot.datasets import check_video_encoder_parameters_pyav - check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options()) + check_video_encoder_parameters_pyav( + self.vcodec, self.pix_fmt, self.get_codec_options(), channels=self._DEFAULT_CHANNELS + ) def resolve_vcodec(self) -> None: """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder. 
@@ -230,6 +244,79 @@ class VideoEncoderConfig: return opts -def camera_encoder_defaults() -> VideoEncoderConfig: - """Return a :class:`VideoEncoderConfig` with RGB-camera defaults.""" - return VideoEncoderConfig() +@dataclass +class RGBEncoderConfig(VideoEncoderConfig): + """Encoder configuration for RGB camera streams. + + Identical to :class:`VideoEncoderConfig` but declares the 3-channel + source-data layout so ``pix_fmt`` is validated against RGB inputs. + """ + + _DEFAULT_CHANNELS: ClassVar[int] = 3 + + +def rgb_encoder_defaults() -> RGBEncoderConfig: + """Return a :class:`RGBEncoderConfig` with RGB-camera defaults.""" + return RGBEncoderConfig() + + +@dataclass +class DepthEncoderConfig(VideoEncoderConfig): + """Encoder configuration for depth-map streams. + + Inherits the full :class:`VideoEncoderConfig` surface (codec, GOP, CRF, + preset, ``extra_options``…) and adds the parameters of the depth quantizer. + Defaults flip ``vcodec`` to ``"hevc"`` (Main 12 profile) and ``pix_fmt`` to + ``"gray12le"``. + """ + + vcodec: str = "hevc" # Video codec name. Defaults to HEVC Main 12 (a 12-bit-capable codec). + pix_fmt: str = "gray12le" # Pixel format. Defaults to 12-bit grayscale. + extra_options: dict[str, Any] = field(default_factory=lambda: {"x265-params": "lossless=1"}) + + depth_min: float = DEFAULT_DEPTH_MIN # Minimum depth in meters, mapped to the lowest quantum. + depth_max: float = DEFAULT_DEPTH_MAX # Maximum depth in meters, mapped to the highest quantum. + shift: float = DEFAULT_DEPTH_SHIFT # Pre-log offset in meters for numerical stability near zero. + use_log: bool = DEFAULT_DEPTH_USE_LOG # Use logarithmic quantization (True) or linear (False). + + _DEFAULT_CHANNELS: ClassVar[int] = 1 + + @classmethod + def _kwargs_from_video_info(cls, video_info: dict | None) -> dict[str, Any]: + """Layer the depth-specific tuning (``depth_min`` / ``depth_max`` / + ``shift`` / ``use_log``) on top of the base parser. Missing keys + fall back to the class defaults. 
+ """ + kwargs = super()._kwargs_from_video_info(video_info) + video_info = video_info or {} + for name in DEPTH_ENCODER_INFO_FIELD_NAMES: + value = video_info.get(f"video.{name}") + if value is not None: + kwargs[name] = value + return kwargs + + +def depth_encoder_defaults() -> DepthEncoderConfig: + """Return a :class:`DepthEncoderConfig` with depth-camera defaults.""" + return DepthEncoderConfig() + + +def encoder_config_from_video_info(video_info: dict | None) -> VideoEncoderConfig: + """Build the appropriate encoder config from a feature's ``info`` block. + + Dispatches to :class:`DepthEncoderConfig` when the dict marks the feature + as a depth map and to :class:`RGBEncoderConfig` + otherwise. + + Args: + video_info: A feature's ``info`` dict as persisted in ``info.json``, + or ``None`` (treated as an empty dict). + + Returns: + A :class:`DepthEncoderConfig` for depth features, otherwise a + :class:`RGBEncoderConfig`. + """ + video_info = video_info or {} + is_depth = bool(video_info.get("is_depth_map") or video_info.get("video.is_depth_map")) + cls: type[VideoEncoderConfig] = DepthEncoderConfig if is_depth else RGBEncoderConfig + return cls.from_video_info(video_info) diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py index 09765c130..88f7ea226 100644 --- a/src/lerobot/datasets/compute_stats.py +++ b/src/lerobot/datasets/compute_stats.py @@ -242,12 +242,12 @@ def sample_images(image_paths: list[str]) -> np.ndarray: images = None for i, idx in enumerate(sampled_indices): path = image_paths[idx] - # we load as uint8 to reduce memory usage + # we load RGB images as uint8 to reduce memory usage; depth keeps its native dtype img = load_image_as_numpy(path, dtype=np.uint8, channel_first=True) img = auto_downsample_height_width(img) if images is None: - images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8) + images = np.empty((len(sampled_indices), *img.shape), dtype=img.dtype) images[i] = img @@ -506,8 +506,10 
@@ def compute_episode_stats( Each statistics dictionary contains min, max, mean, std, count, and quantiles. Note: - Image statistics are normalized to [0,1] range and have shape (3,1,1) for - per-channel values when dtype is 'image' or 'video'. + For 'image'/'video' features, stats are computed per channel and kept with a + leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by + 255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip + this rescaling and remain in their stored units. """ if quantile_list is None: quantile_list = DEFAULT_QUANTILES @@ -531,8 +533,12 @@ def compute_episode_stats( ) if features[key]["dtype"] in ["image", "video"]: + normalization_factor = ( + 255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0 + ) ep_stats[key] = { - k: v if k == "count" else np.squeeze(v / 255.0, axis=0) for k, v in ep_stats[key].items() + k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0) + for k, v in ep_stats[key].items() } return ep_stats @@ -552,8 +558,10 @@ def _validate_stat_value(value: np.ndarray, key: str, feature_key: str) -> None: if key == "count" and value.shape != (1,): raise ValueError(f"Shape of 'count' must be (1), but is {value.shape} instead.") - if "image" in feature_key and key != "count" and value.shape != (3, 1, 1): - raise ValueError(f"Shape of quantile '{key}' must be (3,1,1), but is {value.shape} instead.") + if "image" in feature_key and key != "count" and value.shape not in ((3, 1, 1), (1, 1, 1)): + raise ValueError( + f"Shape of quantile '{key}' must be (3,1,1) or (1,1,1) but is {value.shape} instead." 
+ ) def _assert_type_and_shape(stats_list: list[dict[str, dict]]): diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py index b496e4f65..ea329668c 100644 --- a/src/lerobot/datasets/dataset_metadata.py +++ b/src/lerobot/datasets/dataset_metadata.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib -from collections.abc import Callable +import logging +from collections.abc import Callable, Iterable from copy import deepcopy from pathlib import Path @@ -338,6 +339,25 @@ class LeRobotDatasetMetadata: """Keys to access visual modalities stored as videos.""" return [key for key, ft in self.features.items() if ft["dtype"] == "video"] + @property + def depth_keys(self) -> list[str]: + """Keys to access depth-map modalities stored as videos or images. + + A depth key is a feature whose ``info`` dict carries ``"is_depth_map": True`` + (or the legacy ``"video.is_depth_map"`` inside ``info`` or ``video_info``). + """ + + def _is_depth(ft: dict) -> bool: + info = ft.get("info") or {} + video_info = ft.get("video_info") or {} + return ( + info.get("is_depth_map", False) + or info.get("video.is_depth_map", False) + or video_info.get("video.is_depth_map", False) + ) + + return [key for key, ft in self.features.items() if _is_depth(ft)] + @property def camera_keys(self) -> list[str]: """Keys to access visual modalities (regardless of their storage method).""" @@ -581,29 +601,48 @@ class LeRobotDatasetMetadata: def update_video_info( self, video_key: str | None = None, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, + preserve_keys: Iterable[str] | None = None, ) -> None: - """Populate per-feature video info in ``info.json``. + """Populate or refresh per-feature video info in ``info.json``. 
Warning: this function writes info from first episode videos, implicitly assuming that all videos have been encoded the same way. Also, this means it assumes the first episode exists. + Always re-probes the videos and overwrites existing info for every recomputed + key. ``preserve_keys`` lists keys whose existing values must be kept (e.g. + data-intrinsic entries like ``is_depth_map`` and depth quantization params) + instead of being recomputed. + Args: video_key: If provided, only update this video key. Otherwise update all video keys in the dataset. - camera_encoder: Encoder configuration used to produce the + video_encoder: Encoder configuration used to produce the videos. When provided, its fields are recorded as ``video.`` entries alongside the stream-derived ``video.*`` entries (see :func:`get_video_info`). + preserve_keys: Keys whose existing values are kept instead of being + recomputed. ``None`` (default) recomputes every key. """ if video_key is not None and video_key not in self.video_keys: raise ValueError(f"Video key {video_key} not found in dataset") video_keys = [video_key] if video_key is not None else self.video_keys + preserve_set = set(preserve_keys or ()) for key in video_keys: - if not self.features[key].get("info", None): - video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0) - self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder) + existing = self.features[key].get("info") or {} + video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0) + new_info = get_video_info(video_path, video_encoder=video_encoder) + # Drop preserved keys so the existing values win on merge. + new_info = {k: v for k, v in new_info.items() if k not in preserve_set} + merged = {**existing, **new_info} + # Migrate the legacy depth marker to the canonical key. 
+ if "video.is_depth_map" in merged: + logging.warning( + f"Migrating legacy 'video.is_depth_map' to 'is_depth_map' for feature {key!r}." + ) + merged.setdefault("is_depth_map", merged.pop("video.is_depth_map")) + self.info.features[key]["info"] = merged def update_chunk_settings( self, diff --git a/src/lerobot/datasets/dataset_reader.py b/src/lerobot/datasets/dataset_reader.py index d7289ac48..e8e07301e 100644 --- a/src/lerobot/datasets/dataset_reader.py +++ b/src/lerobot/datasets/dataset_reader.py @@ -22,7 +22,10 @@ from pathlib import Path import datasets import torch +from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig + from .dataset_metadata import LeRobotDatasetMetadata +from .depth_utils import dequantize_depth from .feature_utils import ( check_delta_timestamps, get_delta_indices, @@ -51,6 +54,7 @@ class DatasetReader: delta_timestamps: dict[str, list[float]] | None, image_transforms: Callable | None, return_uint8: bool = False, + depth_output_unit: str = DEFAULT_DEPTH_UNIT, ): """Initialize the reader with metadata, filtering, and transform config. @@ -68,6 +72,10 @@ class DatasetReader: relative timestamp offsets for temporal context windows. image_transforms: Optional torchvision v2 transform applied to visual features. + return_uint8: If True, return RGB video frames as raw uint8 tensors + instead of normalized float32. + depth_output_unit: Physical unit depth maps are dequantized to + (``"m"`` or ``"mm"``). Defaults to ``"mm"``. 
""" self._meta = meta self.root = root @@ -78,6 +86,7 @@ class DatasetReader: raise TypeError("image_transforms must be callable or None.") self._image_transforms = image_transforms self._return_uint8 = return_uint8 + self._depth_output_unit = depth_output_unit self.hf_dataset: datasets.Dataset | None = None self._absolute_to_relative_idx: dict[int, int] | None = None @@ -88,6 +97,11 @@ class DatasetReader: check_delta_timestamps(delta_timestamps, meta.fps, tolerance_s) self.delta_indices = get_delta_indices(delta_timestamps, meta.fps) + self._depth_encoder_configs: dict[str, DepthEncoderConfig] = { + vid_key: DepthEncoderConfig.from_video_info(self._meta.features[vid_key].get("info")) + for vid_key in self._meta.depth_keys + } + def set_image_transforms(self, image_transforms: Callable | None) -> None: """Replace the transform applied to visual observations.""" if image_transforms is not None and not callable(image_transforms): @@ -259,7 +273,18 @@ class DatasetReader: self._tolerance_s, self._video_backend, return_uint8=self._return_uint8, + is_depth=vid_key in self._meta.depth_keys, ) + if vid_key in self._meta.depth_keys: + depth_encoder = self._depth_encoder_configs[vid_key] + frames = dequantize_depth( + frames, + depth_min=depth_encoder.depth_min, + depth_max=depth_encoder.depth_max, + shift=depth_encoder.shift, + use_log=depth_encoder.use_log, + output_unit=self._depth_output_unit, + ) return vid_key, frames.squeeze(0) items = list(query_timestamps.items()) @@ -299,8 +324,9 @@ class DatasetReader: item = {**video_frames, **item} if self._image_transforms is not None: - image_keys = self._meta.camera_keys - for cam in image_keys: + for cam in self._meta.camera_keys: + if cam in self._meta.depth_keys: + continue item[cam] = self._image_transforms(item[cam]) # Add task as a string diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 9aca859b4..31e075d7c 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ 
b/src/lerobot/datasets/dataset_tools.py @@ -37,7 +37,15 @@ import pyarrow.parquet as pq import torch from tqdm import tqdm -from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults +from lerobot.configs import ( + DepthEncoderConfig, + RGBEncoderConfig, + VideoEncoderConfig, + depth_encoder_defaults, + encoder_config_from_video_info, + rgb_encoder_defaults, +) +from lerobot.configs.video import DEPTH_ENCODER_INFO_FIELD_NAMES from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE from lerobot.utils.utils import flatten_dict @@ -48,6 +56,7 @@ from .compute_stats import ( compute_relative_action_stats, ) from .dataset_metadata import LeRobotDatasetMetadata +from .image_writer import write_image from .io_utils import ( get_parquet_file_size_in_mb, load_episodes, @@ -62,12 +71,13 @@ from .utils import ( DEFAULT_DATA_FILE_SIZE_IN_MB, DEFAULT_DATA_PATH, DEFAULT_EPISODES_PATH, + DEPTH_FILE_PATTERN, + IMAGE_FILE_PATTERN, VIDEO_DIR, update_chunk_file_indices, ) from .video_utils import ( encode_video_frames, - get_video_info, reencode_video, ) @@ -601,7 +611,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - camera_encoder: VideoEncoderConfig, + video_encoder: VideoEncoderConfig, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -615,7 +625,7 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - camera_encoder: Video encoder settings used to re-encode the kept frames. + video_encoder: Video encoder settings used to re-encode the kept frames. """ from fractions import Fraction @@ -640,13 +650,13 @@ def _keep_episodes_from_video_with_av( # Convert fps to Fraction for PyAV compatibility. 
fps_fraction = Fraction(fps).limit_denominator(1000) - codec_options = camera_encoder.get_codec_options(as_strings=True) - v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options) + codec_options = video_encoder.get_codec_options(as_strings=True) + v_out = out.add_stream(video_encoder.vcodec, rate=fps_fraction, options=codec_options) # PyAV type stubs don't distinguish video streams from audio/subtitle streams. v_out.width = v_in.codec_context.width v_out.height = v_in.codec_context.height - v_out.pix_fmt = camera_encoder.pix_fmt + v_out.pix_fmt = video_encoder.pix_fmt # Set time_base to match the frame rate for proper timestamp handling. v_out.time_base = Fraction(1, int(fps)) @@ -733,7 +743,7 @@ def _copy_and_reindex_videos( for video_key in src_dataset.meta.video_keys: logging.info(f"Processing videos for {video_key}") - camera_encoder = VideoEncoderConfig.from_video_info( + video_encoder = encoder_config_from_video_info( src_dataset.meta.info.features.get(video_key, {}).get("info") ) @@ -817,7 +827,7 @@ def _copy_and_reindex_videos( dst_video_path, episodes_to_keep_ranges, src_dataset.meta.fps, - camera_encoder, + video_encoder, ) cumulative_ts = 0.0 @@ -874,11 +884,11 @@ def _copy_and_reindex_episodes_metadata( episode_meta.update(video_metadata[new_idx]) # Extract episode statistics from parquet metadata. - # Note (maractingi): When pandas/pyarrow serializes numpy arrays with shape (3, 1, 1) to parquet, + # When pandas/pyarrow serializes numpy arrays with shape (C, 1, 1) to parquet, # they are being deserialized as nested object arrays like: # array([array([array([0.])]), array([array([0.])]), array([array([0.])])]) # This happens particularly with image/video statistics. We need to detect and flatten - # these nested structures back to proper (3, 1, 1) arrays so aggregate_stats can process them. + # these nested structures back to proper (C, 1, 1) arrays so aggregate_stats can process them. 
episode_stats = {} for key in src_episode_full: if key.startswith("stats/"): @@ -894,15 +904,16 @@ def _copy_and_reindex_episodes_metadata( if feature_name in src_dataset.meta.features: feature_dtype = src_dataset.meta.features[feature_name]["dtype"] if feature_dtype in ["image", "video"] and stat_name != "count": + # Stats are channel-first (C, 1, 1) if isinstance(value, np.ndarray) and value.dtype == object: flat_values = [] for item in value: while isinstance(item, np.ndarray): item = item.flatten()[0] flat_values.append(item) - value = np.array(flat_values, dtype=np.float64).reshape(3, 1, 1) - elif isinstance(value, np.ndarray) and value.shape == (3,): - value = value.reshape(3, 1, 1) + value = np.array(flat_values, dtype=np.float64).reshape(-1, 1, 1) + elif isinstance(value, np.ndarray) and value.ndim == 1: + value = value.reshape(-1, 1, 1) episode_stats[feature_name][stat_name] = value @@ -1153,15 +1164,15 @@ def _save_episode_images_for_video( # Get all items for this episode episode_dataset = imgs_dataset.select(range(from_idx, to_idx)) + is_depth = img_key in dataset.meta.depth_keys + frame_pattern = DEPTH_FILE_PATTERN if is_depth else IMAGE_FILE_PATTERN + # Define function to save a single image def save_single_image(i_item_tuple): i, item = i_item_tuple - img = item[img_key] - # Use frame-XXXXXX.png format to match encode_video_frames expectations - img.save(str(imgs_dir / f"frame-{i:06d}.png"), quality=100) + write_image(item[img_key], imgs_dir / frame_pattern.format(frame_index=i)) return i - # Save images with proper naming convention for encode_video_frames (frame-XXXXXX.png) items = list(enumerate(episode_dataset)) with ThreadPoolExecutor(max_workers=num_workers) as executor: @@ -1193,13 +1204,14 @@ def _save_batch_episodes_images( hf_dataset = dataset.hf_dataset.with_format(None) imgs_dataset = hf_dataset.select_columns(img_key) + is_depth = img_key in dataset.meta.depth_keys + frame_pattern = DEPTH_FILE_PATTERN if is_depth else IMAGE_FILE_PATTERN 
+ # Define function to save a single image with global frame index # Defined once outside the loop to avoid repeated closure creation def save_single_image(i_item_tuple, base_frame_idx, img_key_param): i, item = i_item_tuple - img = item[img_key_param] - # Use global frame index for naming - img.save(str(imgs_dir / f"frame-{base_frame_idx + i:06d}.png"), quality=100) + write_image(item[img_key_param], imgs_dir / frame_pattern.format(frame_index=base_frame_idx + i)) return i episode_durations = [] @@ -1290,7 +1302,7 @@ def _estimate_frame_size_via_calibration( episode_indices: list[int], temp_dir: Path, fps: int, - camera_encoder: VideoEncoderConfig, + video_encoder: VideoEncoderConfig, num_calibration_frames: int = 30, ) -> float: """Estimate MB per frame by encoding a small calibration sample. @@ -1304,7 +1316,7 @@ def _estimate_frame_size_via_calibration( episode_indices: List of episode indices being processed. temp_dir: Temporary directory for calibration files. fps: Frames per second for video encoding. - camera_encoder: Video encoder settings used for calibration encoding. + video_encoder: Video encoder settings used for calibration encoding. num_calibration_frames: Number of frames to use for calibration (default: 30). Returns: @@ -1329,10 +1341,11 @@ def _estimate_frame_size_via_calibration( hf_dataset = dataset.hf_dataset.with_format(None) sample_indices = range(from_idx, from_idx + num_frames) - # Save calibration frames + # Save calibration frames using the suffix/format the encoder expects. 
+ is_depth = img_key in dataset.meta.depth_keys + frame_pattern = DEPTH_FILE_PATTERN if is_depth else IMAGE_FILE_PATTERN for i, idx in enumerate(sample_indices): - img = hf_dataset[idx][img_key] - img.save(str(calibration_dir / f"frame-{i:06d}.png"), quality=100) + write_image(hf_dataset[idx][img_key], calibration_dir / frame_pattern.format(frame_index=i)) # Encode calibration video calibration_video_path = calibration_dir / "calibration.mp4" @@ -1340,7 +1353,7 @@ def _estimate_frame_size_via_calibration( imgs_dir=calibration_dir, video_path=calibration_video_path, fps=fps, - camera_encoder=camera_encoder, + video_encoder=video_encoder, overwrite=True, ) @@ -1613,6 +1626,7 @@ def recompute_stats( raise ValueError(f"No parquet files found in {data_dir}") all_episode_stats = [] + # TODO: enable image and video stats re-computation numeric_keys = [k for k, v in features_to_compute.items() if v["dtype"] not in ["image", "video"]] for parquet_path in tqdm(parquet_files, desc="Computing stats from data files"): @@ -1658,7 +1672,8 @@ def convert_image_to_video_dataset( dataset: LeRobotDataset, output_dir: Path | None = None, repo_id: str | None = None, - camera_encoder: VideoEncoderConfig | None = None, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, episode_indices: list[int] | None = None, num_workers: int = 4, max_episodes_per_batch: int | None = None, @@ -1670,21 +1685,32 @@ def convert_image_to_video_dataset( LeRobot dataset structure with videos stored in chunked MP4 files. Args: - dataset: The source LeRobot dataset with images - output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. - repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - camera_encoder: Video encoder settings - (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). 
- episode_indices: List of episode indices to convert (None = all episodes) - num_workers: Number of threads for parallel processing (default: 4) - max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) - max_frames_per_batch: Maximum frames per video batch to avoid memory issues (None = no limit) + dataset: The source LeRobot dataset with images. + output_dir: Root directory where the converted dataset will be stored. When + ``None``, defaults to ``$HF_LEROBOT_HOME/repo_id``. Equivalent to + ``new_root`` in ``EditDatasetConfig``. + repo_id: Converted dataset identifier. Equivalent to ``new_repo_id`` in + ``EditDatasetConfig``. + rgb_encoder: Video encoder settings applied to RGB cameras. When ``None``, + :func:`~lerobot.configs.video.rgb_encoder_defaults` is used. + depth_encoder: Video encoder settings applied to depth-map cameras, including + the quantization parameters persisted to the dataset metadata. When + ``None``, :func:`~lerobot.configs.video.depth_encoder_defaults` is used. + episode_indices: Episode indices to convert. When ``None``, all episodes are + converted. + num_workers: Number of threads for parallel processing. + max_episodes_per_batch: Maximum episodes per video batch, to bound memory use. + ``None`` means no limit. + max_frames_per_batch: Maximum frames per video batch, to bound memory use. + ``None`` means no limit. Returns: - New LeRobotDataset with images encoded as videos + A new :class:`LeRobotDataset` with images encoded as videos. 
""" - if camera_encoder is None: - camera_encoder = camera_encoder_defaults() + if rgb_encoder is None: + rgb_encoder = rgb_encoder_defaults() + if depth_encoder is None: + depth_encoder = depth_encoder_defaults() # Check that it's an image dataset if len(dataset.meta.video_keys) > 0: @@ -1709,10 +1735,7 @@ def convert_image_to_video_dataset( logging.info( f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" ) - logging.info( - f"Video codec: {camera_encoder.vcodec}, pixel format: {camera_encoder.pix_fmt}, " - f"GOP: {camera_encoder.g}, CRF: {camera_encoder.crf}" - ) + logging.info(f"RGB video encoder: {rgb_encoder}, depth video encoder: {depth_encoder}") # Create new features dict, converting image features to video features new_features = {} @@ -1774,6 +1797,8 @@ def convert_image_to_video_dataset( episode_lengths = {ep_idx: dataset.meta.episodes["length"][ep_idx] for ep_idx in episode_indices} for img_key in tqdm(img_keys, desc="Processing cameras"): + target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else rgb_encoder + # Estimate size per frame by encoding a small calibration sample # This provides accurate compression ratio for the specific codec parameters size_per_frame_mb = _estimate_frame_size_via_calibration( @@ -1782,7 +1807,7 @@ def convert_image_to_video_dataset( episode_indices=episode_indices, temp_dir=temp_dir, fps=fps, - camera_encoder=camera_encoder, + video_encoder=target_encoder, ) logging.info(f"Processing camera: {img_key}") @@ -1824,7 +1849,7 @@ def convert_image_to_video_dataset( imgs_dir=imgs_dir, video_path=video_path, fps=fps, - camera_encoder=camera_encoder, + video_encoder=target_encoder, overwrite=True, ) @@ -1863,16 +1888,11 @@ def convert_image_to_video_dataset( new_meta.info.total_tasks = dataset.meta.total_tasks new_meta.info.splits = {"train": f"0:{len(episode_indices)}"} - # Update video info for all image keys (now videos) - # We need to manually set video info 
since update_video_info() checks video_keys first + # Update video info for all image keys (now videos). They are registered as + # video features above, so update_video_info populates their (still-empty) info. for img_key in img_keys: - if not new_meta.features[img_key].get("info", None): - video_path = new_meta.root / new_meta.video_path.format( - video_key=img_key, chunk_index=0, file_index=0 - ) - new_meta.info.features[img_key]["info"] = get_video_info( - video_path, camera_encoder=camera_encoder - ) + target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else rgb_encoder + new_meta.update_video_info(video_key=img_key, video_encoder=target_encoder) write_info(new_meta.info, new_meta.root) @@ -1899,11 +1919,11 @@ def convert_image_to_video_dataset( def _reencode_video_worker(args: tuple) -> Path: """Picklable worker for :func:`reencode_dataset`'s process pool.""" - video_path, camera_encoder, encoder_threads = args + video_path, video_encoder, encoder_threads = args reencode_video( input_video_path=video_path, output_video_path=video_path, - camera_encoder=camera_encoder, + video_encoder=video_encoder, encoder_threads=encoder_threads, overwrite=True, ) @@ -1912,7 +1932,8 @@ def _reencode_video_worker(args: tuple) -> Path: def reencode_dataset( dataset: LeRobotDataset, - camera_encoder: VideoEncoderConfig, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, encoder_threads: int | None = None, num_workers: int | None = None, ) -> LeRobotDataset: @@ -1923,8 +1944,11 @@ def reencode_dataset( Args: dataset: An existing :class:`LeRobotDataset` whose videos will be re-encoded. - camera_encoder: Target encoder configuration applied to every video - file. + rgb_encoder: Target encoder configuration applied to every RGB video + file. If ``None``, re-encoding is skipped for RGB videos. + depth_encoder: Target encoder configuration applied to every depth video + file. 
If ``None``, re-encoding is skipped for depth videos. + Quantization parameters will not override the ones in the current dataset. encoder_threads: Per-encoder thread count forwarded to :func:`reencode_video`. ``None`` lets the codec decide. num_workers: Number of parallel processes. ``None`` or ``0`` means @@ -1936,23 +1960,35 @@ def reencode_dataset( on disk. """ meta = dataset.meta - video_paths_list = [] + video_keys_encoders_dict = {} + video_keys_paths_dict = {} + + if rgb_encoder is None and depth_encoder is None: + raise ValueError("Either rgb_encoder or depth_encoder must be provided") # Only re-encode if the videos are not already encoded with the given video encoding parameters for video_key in meta.video_keys: current_info = meta.info.features[video_key].get("info", {}) - current_encoder = VideoEncoderConfig.from_video_info(current_info) - if current_encoder != camera_encoder: - video_paths_list.extend((meta.root / VIDEO_DIR / video_key).rglob("*.mp4")) + current_encoder = encoder_config_from_video_info(current_info) + target_encoder = depth_encoder if video_key in meta.depth_keys else rgb_encoder + if target_encoder is None: + logging.info(f"No encoder provided for {video_key} video. Skipping re-encoding.") + elif current_encoder != target_encoder: + video_keys_paths_dict[video_key] = list((meta.root / VIDEO_DIR / video_key).rglob("*.mp4")) + video_keys_encoders_dict[video_key] = target_encoder else: - logging.info(f"{video_key} videos are already encoded with {camera_encoder}. Nothing to do.") + logging.info(f"{video_key} videos are already encoded with {target_encoder}. 
Nothing to do.") - if len(video_paths_list) == 0: + if len(video_keys_paths_dict) == 0: logging.warning("Dataset has no videos to re-encode.") return dataset - logging.info(f"Re-encoding {len(video_paths_list)} video file(s) with {camera_encoder}") + logging.info(f"Re-encoding {sum(len(paths) for paths in video_keys_paths_dict.values())} video file(s).") - worker_args = [(vp, camera_encoder, encoder_threads) for vp in video_paths_list] + worker_args = [ + (path, encoder, encoder_threads) + for video_key, encoder in video_keys_encoders_dict.items() + for path in video_keys_paths_dict[video_key] + ] if num_workers and num_workers > 1: with ProcessPoolExecutor(max_workers=num_workers) as pool: futures = [pool.submit(_reencode_video_worker, args) for args in worker_args] @@ -1966,10 +2002,15 @@ def reencode_dataset( for args in tqdm(worker_args, desc="Re-encoding videos"): _reencode_video_worker(args) - # Refresh video info in metadata for every video key. - for vid_key in meta.video_keys: - video_path = meta.root / meta.get_video_file_path(0, vid_key) - meta.info.features[vid_key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder) + # Refresh video info in metadata for every re-encoded key. Re-encoding only + # changes codec/container params, so for depth videos we preserve ``is_depth_map`` + # and the depth quantization params (``video.depth_min`` / ``video.depth_max`` / + # ...), which describe the data rather than the codec and must survive a transcode. + # RGB videos pass an empty set: still a refresh, but nothing to preserve. 
+ depth_preserve_keys = {"is_depth_map", *(f"video.{n}" for n in DEPTH_ENCODER_INFO_FIELD_NAMES)} + for video_key, encoder in video_keys_encoders_dict.items(): + preserve_keys = depth_preserve_keys if video_key in meta.depth_keys else set() + meta.update_video_info(video_key=video_key, video_encoder=encoder, preserve_keys=preserve_keys) write_info(meta.info, meta.root) logging.info("Dataset metadata updated.") diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py index 633c00c1a..1aee1497c 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -31,7 +31,13 @@ import PIL.Image import pyarrow.parquet as pq import torch -from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults +from lerobot.configs import ( + DepthEncoderConfig, + RGBEncoderConfig, + VideoEncoderConfig, + depth_encoder_defaults, + rgb_encoder_defaults, +) from .compute_stats import compute_episode_stats from .dataset_metadata import LeRobotDatasetMetadata @@ -48,6 +54,7 @@ from .io_utils import ( write_info, ) from .utils import ( + DEFAULT_DEPTH_PATH, DEFAULT_EPISODES_PATH, DEFAULT_IMAGE_PATH, update_chunk_file_indices, @@ -67,17 +74,22 @@ def _encode_video_worker( episode_index: int, root: Path, fps: int, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" - fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) + path_template = ( + DEFAULT_DEPTH_PATH + if video_encoder is not None and isinstance(video_encoder, DepthEncoderConfig) + else DEFAULT_IMAGE_PATH + ) + fpath = path_template.format(image_key=video_key, episode_index=episode_index, frame_index=0) img_dir = (root / fpath).parent encode_video_frames( img_dir, temp_path, fps, - camera_encoder=camera_encoder, + 
video_encoder=video_encoder, encoder_threads=encoder_threads, overwrite=True, ) @@ -96,7 +108,8 @@ class DatasetWriter: self, meta: LeRobotDatasetMetadata, root: Path, - camera_encoder: VideoEncoderConfig | None, + rgb_encoder: RGBEncoderConfig | None, + depth_encoder: DepthEncoderConfig | None, encoder_threads: int | None, batch_encoding_size: int, streaming_encoder: StreamingVideoEncoder | None = None, @@ -108,8 +121,11 @@ class DatasetWriter: meta: Dataset metadata instance (used for feature schema, chunk settings, and episode persistence). root: Local dataset root directory. - camera_encoder: Video encoder settings applied to all cameras. - ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`. + rgb_encoder: Video encoder settings applied to RGB cameras. When + ``None``, :func:`~lerobot.configs.video.rgb_encoder_defaults` is used. + depth_encoder: Video encoder settings applied to depth cameras, including + the quantization parameters. When ``None``, + :func:`~lerobot.configs.video.depth_encoder_defaults` is used. encoder_threads: Number of encoder threads (global). ``None`` lets the codec decide. 
batch_encoding_size: Number of episodes to accumulate before @@ -120,7 +136,8 @@ class DatasetWriter: """ self._meta = meta self._root = root - self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._rgb_encoder = rgb_encoder or rgb_encoder_defaults() + self._depth_encoder = depth_encoder or depth_encoder_defaults() self._encoder_threads = encoder_threads self._batch_encoding_size = batch_encoding_size self._streaming_encoder = streaming_encoder @@ -145,7 +162,8 @@ class DatasetWriter: return ep_buffer def _get_image_file_path(self, episode_index: int, image_key: str, frame_index: int) -> Path: - fpath = DEFAULT_IMAGE_PATH.format( + path_template = DEFAULT_DEPTH_PATH if image_key in self._meta.depth_keys else DEFAULT_IMAGE_PATH + fpath = path_template.format( image_key=image_key, episode_index=episode_index, frame_index=frame_index ) return self._root / fpath @@ -195,6 +213,7 @@ class DatasetWriter: if frame_index == 0 and self._streaming_encoder is not None: self._streaming_encoder.start_episode( video_keys=list(self._meta.video_keys), + depth_video_keys=list(self._meta.depth_keys), temp_dir=self._root, ) @@ -282,10 +301,13 @@ class DatasetWriter: if use_streaming: streaming_results = self._streaming_encoder.finish_episode() for video_key in self._meta.video_keys: + normalization_factor = 255.0 if video_key not in self._meta.depth_keys else 1.0 temp_path, video_stats = streaming_results[video_key] if video_stats is not None: ep_stats[video_key] = { - k: v if k == "count" else np.squeeze(v.reshape(1, -1, 1, 1) / 255.0, axis=0) + k: v + if k == "count" + else np.squeeze(v.reshape(1, -1, 1, 1) / normalization_factor, axis=0) for k, v in video_stats.items() } ep_metadata.update(self._save_episode_video(video_key, episode_index, temp_path=temp_path)) @@ -300,7 +322,7 @@ class DatasetWriter: episode_index, self._root, self._meta.fps, - self._camera_encoder, + self._depth_encoder if video_key in self._meta.depth_keys else self._rgb_encoder, 
self._encoder_threads, ): video_key for video_key in self._meta.video_keys @@ -511,7 +533,12 @@ class DatasetWriter: # Update video info (only needed when first episode is encoded) if episode_index == 0: - self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder) + self._meta.update_video_info( + video_key, + video_encoder=self._depth_encoder + if video_key in self._meta.depth_keys + else self._rgb_encoder, + ) write_info(self._meta.info, self._meta.root) metadata = { @@ -578,13 +605,14 @@ class DatasetWriter: self.image_writer.wait_until_done() def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: - """Use ffmpeg to convert frames stored as png into mp4 videos.""" + """Use ffmpeg to convert frames stored as png/tiff into mp4 videos.""" + is_depth = video_key in self._meta.depth_keys return _encode_video_worker( video_key, episode_index, self._root, self._meta.fps, - self._camera_encoder, + self._depth_encoder if is_depth else self._rgb_encoder, self._encoder_threads, ) diff --git a/src/lerobot/datasets/depth_utils.py b/src/lerobot/datasets/depth_utils.py new file mode 100644 index 000000000..801c86a09 --- /dev/null +++ b/src/lerobot/datasets/depth_utils.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Depth encoding/decoding helpers for :class:`DepthEncoderConfig`. 
+""" + +import math +from typing import Literal + +import av +import numpy as np +import torch +from numpy.typing import NDArray + +from lerobot.configs.video import ( + DEFAULT_DEPTH_MAX, + DEFAULT_DEPTH_MIN, + DEFAULT_DEPTH_PIX_FMT, + DEFAULT_DEPTH_SHIFT, + DEFAULT_DEPTH_USE_LOG, + DEPTH_METER_UNIT, + DEPTH_MILLIMETER_UNIT, + DEPTH_QMAX, +) + +from .image_writer import squeeze_single_channel +from .pyav_utils import write_u16_plane + +_MM_PER_METRE = 1000.0 +_UINT16_MAX = 65535 + + +def _validate_log_quant_params(depth_min: float, shift: float) -> None: + """Ensure ``log(depth_min + shift)`` is finite.""" + if depth_min + shift <= 0: + raise ValueError( + f"depth_min + shift must be positive for logarithmic quantization, " + f"got depth_min={depth_min} + shift={shift} = {depth_min + shift}" + ) + + +def _depth_input_to_float32_and_unit( + depth: NDArray[np.integer] | NDArray[np.floating], + input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT], +) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]: + """Convert depth to float32 in the chosen unit, and return the resolved unit.""" + resolved_unit = ( + (DEPTH_METER_UNIT if np.issubdtype(depth.dtype, np.floating) else DEPTH_MILLIMETER_UNIT) + if input_unit == "auto" + else input_unit + ) + return depth.astype(np.float32, order="K"), resolved_unit + + +def quantize_depth( + depth: NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor, + depth_min: float = DEFAULT_DEPTH_MIN, + depth_max: float = DEFAULT_DEPTH_MAX, + shift: float = DEFAULT_DEPTH_SHIFT, + use_log: bool = DEFAULT_DEPTH_USE_LOG, + pix_fmt: str = DEFAULT_DEPTH_PIX_FMT, + video_backend: str | None = "pyav", + input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT] = "auto", +) -> NDArray[np.uint16] | av.VideoFrame: + """Quantize depth to 12-bit codes (``uint16``, values ``0…DEPTH_QMAX``). 
def _depth_input_to_float32_and_unit(
    depth: NDArray[np.integer] | NDArray[np.floating],
    input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT],
) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]:
    """Cast *depth* to ``float32`` and resolve the unit its values are expressed in.

    The values are only cast, never rescaled. With ``input_unit="auto"`` a
    floating dtype resolves to metres and any other dtype to millimetres;
    otherwise ``input_unit`` is returned unchanged.
    """
    if input_unit == "auto":
        is_floating = np.issubdtype(depth.dtype, np.floating)
        resolved_unit = DEPTH_METER_UNIT if is_floating else DEPTH_MILLIMETER_UNIT
    else:
        resolved_unit = input_unit
    return depth.astype(np.float32, order="K"), resolved_unit


def quantize_depth(
    depth: NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor,
    depth_min: float = DEFAULT_DEPTH_MIN,
    depth_max: float = DEFAULT_DEPTH_MAX,
    shift: float = DEFAULT_DEPTH_SHIFT,
    use_log: bool = DEFAULT_DEPTH_USE_LOG,
    pix_fmt: str = DEFAULT_DEPTH_PIX_FMT,
    video_backend: str | None = "pyav",
    input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT] = "auto",
) -> NDArray[np.uint16] | av.VideoFrame:
    """Quantize depth to 12-bit codes (``uint16``, values ``0…DEPTH_QMAX``).

    Depth maps are packed into 12-bit integer frames so they fit in standard
    high-bit-depth pixel formats (e.g. ``yuv420p12le`` / ``gray12le``)
    and can be encoded by widely supported video codecs (e.g. HEVC Main 12).
    Logarithmic quantization is the default because it allocates more quanta
    to near-range depth, which matches the (1/depth) error profile of typical
    depth sensors. Math is ported from BEHAVIOR-1K's ``obs_utils.py``.

    **Input units**:

    - ``input_unit="auto"`` (default): infer from dtype (floating = m, non-floating = mm).
    - ``input_unit="mm"``: interpret input values as millimetres.
    - ``input_unit="m"``: interpret input values as metres.

    Quantization math runs in the **resolved input unit**.

    ``depth_min``, ``depth_max``, and ``shift`` are always in **metres**.

    Values outside ``[depth_min, depth_max]`` saturate to ``0`` / ``DEPTH_QMAX``.
    Pixels whose normalized value is NaN (NaN input, or a non-positive
    logarithm argument) are mapped to code ``0``.

    Args:
        depth: Depth map; ``torch.Tensor`` is moved to CPU for conversion. A
            leading or trailing singleton channel axis is dropped first.
        depth_min: Depth (metres) at quantum ``0``.
        depth_max: Depth (metres) at quantum :data:`DEPTH_QMAX`. Must be greater
            than ``depth_min``.
        shift: Depth shift (metres); used in log mode. Must satisfy ``depth_min + shift > 0``.
        use_log: If ``True`` (default), quantize in log space.
        pix_fmt: Pixel format of the returned ``av.VideoFrame``. Only used when
            ``video_backend="pyav"``.
        video_backend: Video backend to use for encoding. Defaults to "pyav".
        input_unit: Input unit policy (``"auto"``, ``"mm"``, ``"m"``).

    Returns:
        When ``video_backend="pyav"``: an ``av.VideoFrame`` in ``pix_fmt`` whose
        first plane holds the codes. Otherwise a ``numpy.ndarray`` with
        ``dtype=uint16`` and values in ``[0, DEPTH_QMAX]``, shaped like ``depth``
        after the singleton channel axis has been dropped.

    Raises:
        ValueError: If ``input_unit`` is not ``"auto"``, ``"mm"``, or ``"m"``.
        ValueError: If ``depth_max`` is not greater than ``depth_min``.
        ValueError: If ``use_log=True`` and ``depth_min + shift <= 0``.
    """
    if input_unit not in ("auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT):
        raise ValueError(
            f"input_unit must be 'auto', '{DEPTH_METER_UNIT}', or '{DEPTH_MILLIMETER_UNIT}', got {input_unit!r}"
        )
    # An empty or inverted range makes the normalization divide by zero (or flip
    # sign) and would silently produce meaningless codes.
    if not depth_max > depth_min:
        raise ValueError(
            f"depth_max must be greater than depth_min, got depth_min={depth_min}, depth_max={depth_max}"
        )
    if use_log:
        _validate_log_quant_params(depth_min, shift)

    if isinstance(depth, torch.Tensor):
        depth = depth.detach().cpu().numpy()

    # Squeeze single-channel dim: (H, W, 1) or (1, H, W) → (H, W)
    depth = squeeze_single_channel(depth)

    depth_f, resolved_unit = _depth_input_to_float32_and_unit(depth, input_unit=input_unit)

    # Convert depth_min, depth_max, and shift (metres) to the resolved input unit.
    unit_scale = 1.0 if resolved_unit == DEPTH_METER_UNIT else _MM_PER_METRE
    depth_min_u = np.float32(depth_min * unit_scale)
    depth_max_u = np.float32(depth_max * unit_scale)
    shift_u = np.float32(shift * unit_scale)

    # Normalization and quantization is performed in the resolved input unit.
    # Out-of-domain pixels are expected here (e.g. sensor holes), so the
    # floating-point warnings are silenced and the results sanitized below.
    with np.errstate(divide="ignore", invalid="ignore"):
        if use_log:
            log_min = math.log(float(depth_min_u + shift_u))
            log_max = math.log(float(depth_max_u + shift_u))
            norm = (np.log(depth_f + shift_u) - log_min) / (log_max - log_min)
        else:
            norm = (depth_f - depth_min_u) / (depth_max_u - depth_min_u)
        codes = np.rint(norm * DEPTH_QMAX)

    # Casting NaN to an unsigned integer is undefined; pin it to code 0.
    # Infinities become large finite values and saturate in the clip.
    codes = np.nan_to_num(codes, nan=0.0)
    quantized = codes.clip(0, DEPTH_QMAX).astype(np.uint16, copy=False)

    if video_backend != "pyav":
        return quantized

    frame = av.VideoFrame.from_ndarray(quantized, format=pix_fmt)
    write_u16_plane(frame.planes[0], quantized)
    return frame
def _drop_singleton_channel(frames):
    """Remove a size-1 channel axis at position ``-3`` or ``-1``; otherwise return *frames* unchanged."""
    if frames.ndim >= 3:
        if frames.shape[-3] == 1:
            return frames.squeeze(-3)
        if frames.shape[-1] == 1:
            return frames.squeeze(-1)
    return frames


def dequantize_depth(
    quantized: NDArray[np.uint16] | av.VideoFrame | torch.Tensor,
    depth_min: float = DEFAULT_DEPTH_MIN,
    depth_max: float = DEFAULT_DEPTH_MAX,
    shift: float = DEFAULT_DEPTH_SHIFT,
    use_log: bool = DEFAULT_DEPTH_USE_LOG,
    pix_fmt: str = DEFAULT_DEPTH_PIX_FMT,
    output_unit: Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT] = DEPTH_MILLIMETER_UNIT,
    output_tensor: bool = True,
    output_channel_last: bool = False,
) -> NDArray[np.uint16] | NDArray[np.float32] | torch.Tensor:
    """Inverse of :func:`quantize_depth`.

    Decoding inverts the same normalized code mapping as :func:`quantize_depth`
    using ``depth_min`` / ``depth_max`` / ``shift`` (in metres), then returns
    the requested output unit. Tuning arguments **must match** :func:`quantize_depth`.

    Accepted input layouts:

    - ``(H, W, 1)`` or ``(H, W)`` — single frame with channel-last.
    - ``(..., 1, H, W)`` — batched frames with channel-first.
    - ``(..., H, W, 1)`` — batched frames with channel-last.

    A size-1 channel axis is removed before the math runs. Inputs with three or
    more dimensions and no size-1 axis at position ``-3`` or ``-1`` are used as
    they are, for both NumPy and torch inputs. The output always gets a
    singleton channel axis back, placed according to ``output_channel_last``.

    Args:
        quantized: 12-bit codes in ``[0, DEPTH_QMAX]``. ``np.ndarray``,
            ``av.VideoFrame``, or ``torch.Tensor`` (any integer or float dtype).
        depth_min, depth_max, shift, use_log: Same as :func:`quantize_depth` (metres).
        pix_fmt: Pixel format used to extract the plane from an ``av.VideoFrame``.
        output_unit: ``"mm"`` returns ``uint16`` millimetres (rint, clip
            ``[0, 65535]``) when returning a numpy array, or ``float32`` mm when
            ``output_tensor=True``. ``"m"`` returns ``float32`` metres in
            ``[depth_min, depth_max]``.
        output_tensor: If True, return a ``torch.Tensor`` instead of a numpy array.
        output_channel_last: If True, the singleton channel axis is appended as
            the last axis (``(..., H, W, 1)``); otherwise it is inserted at
            position ``-3`` (``(..., 1, H, W)``).

    Returns:
        Depth map in the requested unit, dtype and channel layout.

    Raises:
        ValueError: If ``output_unit`` is not ``"m"`` or ``"mm"``.
        ValueError: If ``use_log=True`` and ``depth_min + shift <= 0``.
    """
    if output_unit not in (DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT):
        raise ValueError(
            f"output_unit must be '{DEPTH_METER_UNIT}' or '{DEPTH_MILLIMETER_UNIT}', got {output_unit!r}"
        )
    if use_log:
        _validate_log_quant_params(depth_min, shift)

    if isinstance(quantized, av.VideoFrame):
        quantized = quantized.to_ndarray(format=pix_fmt)

    # Compute the scale and offset first: depth (or log-depth) = code * scale + offset.
    depth_min_m = float(depth_min)
    depth_max_m = float(depth_max)
    shift_m = float(shift)
    if use_log:
        log_min = math.log(depth_min_m + shift_m)
        log_max = math.log(depth_max_m + shift_m)
        scale = (log_max - log_min) / DEPTH_QMAX
        offset = log_min
    else:
        scale = (depth_max_m - depth_min_m) / DEPTH_QMAX
        offset = depth_min_m

    channel_axis = -1 if output_channel_last else -3

    # ── Torch path: stay on the input device, single fp32 allocation. ────────
    if isinstance(quantized, torch.Tensor):
        # Drop the single-channel dimension so the math runs on (..., H, W).
        quantized = _drop_singleton_channel(quantized)

        # Single allocation we own; everything else is in-place.
        buf = quantized.to(dtype=torch.float32, copy=True)
        buf.mul_(scale).add_(offset)
        if use_log:
            buf.exp_().sub_(shift_m)
        buf.clamp_(depth_min_m, depth_max_m)
        buf.unsqueeze_(channel_axis)

        if output_unit == DEPTH_METER_UNIT:
            return buf if output_tensor else buf.cpu().numpy()

        # mm path: round + clamp in float32, skipping the uint16 round-trip
        # when returning a tensor (torch.uint16 is poorly supported).
        buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
        if output_tensor:
            return buf
        return buf.cpu().numpy().astype(np.uint16, copy=False)

    # ── NumPy path: single fp32 allocation, ``out=`` for in-place math. ─────
    # Drop the single-channel dimension so the math runs on (..., H, W). Sharing
    # the helper with the torch path keeps both backends in agreement on inputs
    # that carry no singleton channel axis.
    arr = _drop_singleton_channel(np.asarray(quantized))

    buf = np.empty(arr.shape, dtype=np.float32)
    np.multiply(arr, scale, out=buf)
    np.add(buf, offset, out=buf)
    if use_log:
        np.exp(buf, out=buf)
        np.subtract(buf, shift_m, out=buf)
    np.clip(buf, depth_min_m, depth_max_m, out=buf)
    buf = np.expand_dims(buf, axis=channel_axis)

    if output_unit == DEPTH_METER_UNIT:
        return torch.from_numpy(buf) if output_tensor else buf

    np.multiply(buf, _MM_PER_METRE, out=buf)
    np.rint(buf, out=buf)
    np.clip(buf, 0.0, _UINT16_MAX, out=buf)
    if output_tensor:
        # torch.uint16 support is very limited; return float32 millimetres.
        return torch.from_numpy(buf)
    return buf.astype(np.uint16, copy=False)
def squeeze_single_channel(array: np.ndarray) -> np.ndarray:
    """Strip a singleton channel axis from a 3-D image: ``(1, H, W)`` / ``(H, W, 1)`` -> ``(H, W)``.

    Only the first or the last axis of a 3-D array is ever removed, and the
    first axis takes precedence when both have size 1. Arrays of any other rank,
    and 3-D arrays without a size-1 first or last axis, are returned unchanged.
    In contrast to ``array.squeeze()``, an ``H`` or ``W`` of size 1 in a 2-D
    array is therefore never dropped.
    """
    if array.ndim != 3:
        return array

    first_axis, last_axis = array.shape[0], array.shape[-1]
    if first_axis == 1:
        return array[0]
    return array[..., 0] if last_axis == 1 else array
def save_kwargs_for_path(fpath: Path, compress_level: int) -> dict:
    """Build the format-specific keyword arguments for :meth:`PIL.Image.Image.save`.

    The format is chosen from the (case-insensitive) extension of *fpath*:

    - ``.png``: ``{"compress_level": compress_level}`` (0-9, zlib).
    - ``.tif`` / ``.tiff``: ``{"compression": "raw"}``, used for lossless raw
      depth maps; ``compress_level`` is ignored.

    Raises:
        ValueError: If the extension is neither PNG nor TIFF.
    """
    extension = Path(fpath).suffix.lower()
    if extension not in (".png", ".tif", ".tiff"):
        raise ValueError(f"Unsupported image file extension: {extension}")
    return {"compress_level": compress_level} if extension == ".png" else {"compression": "raw"}
The output format is inferred from the *fpath* + extension: ``.png`` → PNG with ``compress_level``, ``.tiff`` / ``.tif`` + → lossless raw depth maps (TIFF). Args: image (np.ndarray | PIL.Image.Image): The image data to save. @@ -101,7 +157,7 @@ def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path, compress_level img = image else: raise TypeError(f"Unsupported image type: {type(image)}") - img.save(fpath, compress_level=compress_level) + img.save(fpath, **save_kwargs_for_path(fpath, compress_level)) except Exception as e: logger.error("Error writing image %s: %s", fpath, e) diff --git a/src/lerobot/datasets/io_utils.py b/src/lerobot/datasets/io_utils.py index be94f3b3a..868a114f5 100644 --- a/src/lerobot/datasets/io_utils.py +++ b/src/lerobot/datasets/io_utils.py @@ -226,28 +226,50 @@ def load_image_as_numpy( Args: fpath (str | Path): Path to the image file. dtype (np.dtype): The desired data type of the output array. If floating, - pixels are scaled to [0, 1]. + pixels are scaled to [0, 1]. Only used for RGB images. channel_first (bool): If True, converts the image to (C, H, W) format. Otherwise, it remains in (H, W, C) format. Returns: np.ndarray: The image as a numpy array. """ - img = PILImage.open(fpath).convert("RGB") - img_array = np.array(img, dtype=dtype) + is_depth = fpath.endswith(".tiff") or fpath.endswith(".tif") + if is_depth: + # Preserve the native depth dtype (uint16 -> "I;16", float32 -> "F"). + img = PILImage.open(fpath) + img_array = np.array(img) + else: + img = PILImage.open(fpath).convert("RGB") + img_array = np.array(img, dtype=dtype) + if np.issubdtype(dtype, np.floating): + img_array /= 255.0 if channel_first: # (H, W, C) -> (C, H, W) - img_array = np.transpose(img_array, (2, 0, 1)) - if np.issubdtype(dtype, np.floating): - img_array /= 255.0 + img_array = img_array[np.newaxis, ...] if img_array.ndim == 2 else np.transpose(img_array, (2, 0, 1)) return img_array +# PIL modes for 16-bit unsigned depth maps. 
UINT16_PIL_MODES = {"I;16", "I;16B", "I;16L"}


def pil_to_chw_tensor(img: PILImage.Image) -> torch.Tensor:
    """Turn a PIL image into a channel-first tensor.

    Images whose mode is one of ``UINT16_PIL_MODES`` (``uint16`` depth maps) are
    read as ``float32`` and returned as ``(1, H, W)`` in their native units;
    ``ToTensor`` is bypassed for them because it would overflow them to
    ``int16``. Every other mode goes through the standard ``ToTensor`` path.
    """
    if img.mode not in UINT16_PIL_MODES:
        return transforms.ToTensor()(img)

    depth_hw = np.array(img, dtype=np.float32)
    # Prepend the channel axis: (H, W) -> (1, H, W).
    return torch.from_numpy(depth_hw)[None, ...]
Args: @@ -262,8 +284,7 @@ def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[to continue first_item = items_dict[key][0] if isinstance(first_item, PILImage.Image): - to_tensor = transforms.ToTensor() - items_dict[key] = [to_tensor(img) for img in items_dict[key]] + items_dict[key] = [pil_to_chw_tensor(img) for img in items_dict[key]] elif first_item is None or isinstance(first_item, dict): pass else: @@ -329,7 +350,11 @@ def item_to_torch(item: dict) -> dict: """ skip_keys = {"task", *LANGUAGE_COLUMNS} for key, val in item.items(): - if isinstance(val, (np.ndarray | list)) and key not in skip_keys: + if key in skip_keys: + continue + if isinstance(val, PILImage.Image): + item[key] = pil_to_chw_tensor(val) + elif isinstance(val, (np.ndarray | list)): # Convert numpy arrays and lists to torch tensors item[key] = torch.tensor(val) return item diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 49e77b53a..f600f1804 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -24,7 +24,7 @@ import torch.utils from huggingface_hub import HfApi, snapshot_download from huggingface_hub.errors import RevisionNotFoundError -from lerobot.configs import VideoEncoderConfig +from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig, RGBEncoderConfig from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata @@ -58,8 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset): download_videos: bool = True, video_backend: str | None = None, return_uint8: bool = False, + depth_output_unit: str = DEFAULT_DEPTH_UNIT, batch_encoding_size: int = 1, - camera_encoder: VideoEncoderConfig | None = None, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, encoder_threads: int | None = None, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, 
@@ -183,8 +185,11 @@ class LeRobotDataset(torch.utils.data.Dataset): You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. - camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras - (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` + rgb_encoder (RGBEncoderConfig | None, optional): Video encoder settings for cameras + (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.rgb_encoder_defaults` + is used by the writer. + depth_encoder (DepthEncoderConfig | None, optional): Video encoder settings for depth cameras + (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.depth_encoder_defaults` is used by the writer. encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the codec decide. 
@@ -206,6 +211,7 @@ class LeRobotDataset(torch.utils.data.Dataset): self.revision = revision if revision else CODEBASE_VERSION self._video_backend = video_backend if video_backend else get_safe_default_video_backend() self._return_uint8 = return_uint8 + self._depth_output_unit = depth_output_unit self._batch_encoding_size = batch_encoding_size self._encoder_threads = encoder_threads @@ -246,6 +252,7 @@ class LeRobotDataset(torch.utils.data.Dataset): delta_timestamps=delta_timestamps, image_transforms=image_transforms, return_uint8=self._return_uint8, + depth_output_unit=self._depth_output_unit, ) self.image_transforms = image_transforms @@ -271,14 +278,16 @@ class LeRobotDataset(torch.utils.data.Dataset): if streaming_encoding and len(self.meta.video_keys) > 0: streaming_enc = self._build_streaming_encoder( self.meta.fps, - camera_encoder, + rgb_encoder, + depth_encoder, encoder_queue_maxsize, encoder_threads, ) self.writer = DatasetWriter( meta=self.meta, root=self.root, - camera_encoder=camera_encoder, + rgb_encoder=rgb_encoder, + depth_encoder=depth_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -314,19 +323,22 @@ class LeRobotDataset(torch.utils.data.Dataset): delta_timestamps=self.delta_timestamps, image_transforms=self.image_transforms, return_uint8=self._return_uint8, + depth_output_unit=self._depth_output_unit, ) return self.reader @staticmethod def _build_streaming_encoder( fps: int, - camera_encoder: VideoEncoderConfig | None, + rgb_encoder: RGBEncoderConfig | None, + depth_encoder: DepthEncoderConfig | None, encoder_queue_maxsize: int, encoder_threads: int | None, ) -> StreamingVideoEncoder: return StreamingVideoEncoder( fps=fps, - camera_encoder=camera_encoder, + rgb_encoder=rgb_encoder, + depth_encoder=depth_encoder, queue_maxsize=encoder_queue_maxsize, encoder_threads=encoder_threads, ) @@ -655,7 +667,8 @@ class LeRobotDataset(torch.utils.data.Dataset): image_writer_threads: int = 
0, video_backend: str | None = None, batch_encoding_size: int = 1, - camera_encoder: VideoEncoderConfig | None = None, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, metadata_buffer_size: int = 10, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -686,8 +699,10 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend (used when reading back). batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. ``1`` means encode immediately. - camera_encoder: Video encoder settings for cameras (codec, quality, etc.). - When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + rgb_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.rgb_encoder_defaults` is used. + depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.depth_encoder_defaults` is used. encoder_threads: Number of encoder threads (global). ``None`` lets the codec decide. 
metadata_buffer_size: Number of episode metadata records to buffer @@ -722,6 +737,7 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.episodes = None obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend() obj._return_uint8 = False + obj._depth_output_unit = DEFAULT_DEPTH_UNIT obj._batch_encoding_size = batch_encoding_size obj._encoder_threads = encoder_threads @@ -731,12 +747,13 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = cls._build_streaming_encoder( - fps, camera_encoder, encoder_queue_maxsize, encoder_threads + fps, rgb_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - camera_encoder=camera_encoder, + rgb_encoder=rgb_encoder, + depth_encoder=depth_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -759,7 +776,8 @@ class LeRobotDataset(torch.utils.data.Dataset): force_cache_sync: bool = False, video_backend: str | None = None, batch_encoding_size: int = 1, - camera_encoder: VideoEncoderConfig | None = None, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, encoder_threads: int | None = None, image_writer_processes: int = 0, image_writer_threads: int = 0, @@ -787,8 +805,10 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend for reading back data. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. - camera_encoder: Video encoder settings for cameras (codec, quality, etc.). - When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + rgb_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.rgb_encoder_defaults` is used. 
+ depth_encoder: Video encoder settings for depth cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.depth_encoder_defaults` is used. encoder_threads: Number of encoder threads (global). ``None`` lets the codec decide. image_writer_processes: Subprocesses for async image writing. @@ -816,6 +836,7 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.episodes = None obj._video_backend = video_backend if video_backend else get_safe_default_video_backend() obj._return_uint8 = False + obj._depth_output_unit = DEFAULT_DEPTH_UNIT obj._batch_encoding_size = batch_encoding_size if obj._requested_root is not None: @@ -835,12 +856,13 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = cls._build_streaming_encoder( - obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads + obj.meta.fps, rgb_encoder, depth_encoder, encoder_queue_maxsize, encoder_threads ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - camera_encoder=camera_encoder, + rgb_encoder=rgb_encoder, + depth_encoder=depth_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py index d291f8b40..7b7d1e5de 100644 --- a/src/lerobot/datasets/pyav_utils.py +++ b/src/lerobot/datasets/pyav_utils.py @@ -24,6 +24,7 @@ import logging from typing import Any import av +import numpy as np logger = logging.getLogger(__name__) @@ -31,6 +32,34 @@ FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE") FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64") +def write_u16_plane(plane: av.video.plane.VideoPlane, src: np.ndarray, fill_value: int | None = None) -> None: + """Copy a 2D ``uint16`` image into the plane's memory buffer, row by row. 
+ + For speed, each row is padded to a wider size than ``width``, so the true row width in + memory is ``plane.line_size`` (bytes), not ``width``. Copying as one straight stream + would skew the image, so we write only the first ``width`` columns of each row and + leave the padding untouched. + + Args: + plane: Destination 16-bit plane. + src: Source image, shape ``(height, width)``, dtype ``uint16``. + fill_value: If given, every pixel (padding included) is set to this first, so the + padding holds clean data instead of garbage. + """ + height, width = src.shape + stride_u16 = plane.line_size // np.dtype(np.uint16).itemsize + dst = np.frombuffer(plane, dtype=np.uint16).reshape(height, stride_u16) + if fill_value is not None: + dst.fill(fill_value) + dst[:, :width] = src + + +@functools.cache +def get_pix_fmt_channels(pix_fmt: str) -> int: + """Return the number of components (channels) for *pix_fmt*.""" + return len(av.VideoFormat(pix_fmt).components) + + @functools.cache def get_codec(vcodec: str) -> av.codec.Codec | None: """PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if unavailable.""" @@ -92,7 +121,7 @@ def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Opti f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." ) from e elif isinstance(value, (float, int)): - num_val = value + num_val = float(value) else: raise ValueError( f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." @@ -142,6 +171,16 @@ def _check_pixel_format(vcodec: str, pix_fmt: str) -> None: ) +def _check_pix_fmt_channels(pix_fmt: str, channels: int) -> None: + """Ensure *pix_fmt* can carry at least *channels* components.""" + pix_fmt_channels = get_pix_fmt_channels(pix_fmt) + if pix_fmt_channels < channels: + raise ValueError( + f"pix_fmt={pix_fmt!r} carries only {pix_fmt_channels} component(s) " + f"but the source data has {channels} channel(s)." 
+ ) + + def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None: """Validate merged encoder options (typed) against the codec's published AVOptions.""" supported_options = _get_codec_options_by_name(vcodec) @@ -156,12 +195,18 @@ def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None: _check_option_value(vcodec, key, value, supported_options[key]) -def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options: dict[str, Any]) -> None: +def check_video_encoder_parameters_pyav( + vcodec: str, + pix_fmt: str, + codec_options: dict[str, Any], + channels: int | None = None, +) -> None: """Verify *config* is compatible with the bundled FFmpeg build. Checks pixel format, abstract tuning-field compatibility, and each merged encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options` against PyAV (including numeric ``extra_options`` present in that dict). + When *channels* is given, additionally verify that *pix_fmt* carries at least as many components as the source data has channels. No-op when ``config.vcodec`` isn't in the local FFmpeg build. 
Raises: @@ -171,4 +216,6 @@ def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options if not options: raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build") _check_pixel_format(vcodec, pix_fmt) + if channels is not None: + _check_pix_fmt_channels(pix_fmt, channels) _check_codec_options(vcodec, codec_options) diff --git a/src/lerobot/datasets/streaming_dataset.py b/src/lerobot/datasets/streaming_dataset.py index 3c1e4a73c..4c4ae59bf 100644 --- a/src/lerobot/datasets/streaming_dataset.py +++ b/src/lerobot/datasets/streaming_dataset.py @@ -22,9 +22,11 @@ import numpy as np import torch from datasets import load_dataset +from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig from lerobot.utils.constants import HF_LEROBOT_HOME, LOOKAHEAD_BACKTRACKTABLE, LOOKBACK_BACKTRACKTABLE from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata +from .depth_utils import dequantize_depth from .feature_utils import get_delta_indices from .io_utils import item_to_torch from .utils import ( @@ -35,6 +37,7 @@ from .utils import ( ) from .video_utils import ( VideoDecoderCache, + decode_video_frames, decode_video_frames_torchcodec, ) @@ -252,6 +255,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset): rng: np.random.Generator | None = None, shuffle: bool = True, return_uint8: bool = False, + depth_output_unit: str = DEFAULT_DEPTH_UNIT, ): """Initialize a StreamingLeRobotDataset. @@ -272,6 +276,8 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset): seed (int, optional): Reproducibility random seed. rng (np.random.Generator | None, optional): Random number generator. shuffle (bool, optional): Whether to shuffle the dataset across exhaustions. Defaults to True. + depth_output_unit (str, optional): Physical unit depth maps are dequantized to ("m" or "mm"). + Defaults to "mm". 
""" super().__init__() self.repo_id = repo_id @@ -290,6 +296,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset): self.streaming = streaming self.buffer_size = buffer_size self._return_uint8 = return_uint8 + self._depth_output_unit = depth_output_unit # We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown) self.video_decoder_cache = None @@ -306,6 +313,11 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset): # Check version check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION) + self._depth_encoder_configs: dict[str, DepthEncoderConfig] = { + vid_key: DepthEncoderConfig.from_video_info(self.meta.features[vid_key].get("info")) + for vid_key in self.meta.depth_keys + } + self.delta_timestamps = None self.delta_indices = None @@ -554,13 +566,34 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset): for video_key, query_ts in query_timestamps.items(): root = self.meta.url_root if self.streaming and not self.streaming_from_local else self.root video_path = f"{root}/{self.meta.get_video_file_path(ep_idx, video_key)}" - frames = decode_video_frames_torchcodec( - video_path, - query_ts, - self.tolerance_s, - decoder_cache=self.video_decoder_cache, - return_uint8=self._return_uint8, - ) + if video_key in self.meta.depth_keys: + # Depth maps are 12-bit quantized and only decodable via pyav; dequantize back + # to physical units to match the non-streaming reader. 
+ frames = decode_video_frames( + video_path, + query_ts, + self.tolerance_s, + backend="pyav", + return_uint8=False, + is_depth=True, + ) + depth_encoder = self._depth_encoder_configs[video_key] + frames = dequantize_depth( + frames, + depth_min=depth_encoder.depth_min, + depth_max=depth_encoder.depth_max, + shift=depth_encoder.shift, + use_log=depth_encoder.use_log, + output_unit=self._depth_output_unit, + ) + else: + frames = decode_video_frames_torchcodec( + video_path, + query_ts, + self.tolerance_s, + decoder_cache=self.video_decoder_cache, + return_uint8=self._return_uint8, + ) item[video_key] = frames.squeeze(0) if len(query_ts) == 1 else frames diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index de91978ea..d30761515 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -87,11 +87,14 @@ DATA_DIR = "data" VIDEO_DIR = "videos" CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}" +IMAGE_FILE_PATTERN = "frame-{frame_index:06d}.png" +DEPTH_FILE_PATTERN = "frame-{frame_index:06d}.tiff" DEFAULT_TASKS_PATH = "meta/tasks.parquet" DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet" DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet" DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4" -DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png" +DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/" + IMAGE_FILE_PATTERN +DEFAULT_DEPTH_PATH = "images/{image_key}/episode-{episode_index:06d}/" + DEPTH_FILE_PATTERN LEGACY_EPISODES_PATH = "meta/episodes.jsonl" LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl" diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index ca90fba45..ef3005dd8 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -39,11 +39,17 @@ from datasets.features.features import 
register_feature from PIL import Image from lerobot.configs import ( + DepthEncoderConfig, + RGBEncoderConfig, VideoEncoderConfig, - camera_encoder_defaults, + depth_encoder_defaults, + rgb_encoder_defaults, ) from lerobot.utils.import_utils import get_safe_default_video_backend +from .depth_utils import quantize_depth +from .pyav_utils import get_pix_fmt_channels + logger = logging.getLogger(__name__) @@ -53,6 +59,7 @@ def decode_video_frames( tolerance_s: float, backend: str | None = None, return_uint8: bool = False, + is_depth: bool = False, ) -> torch.Tensor: """ Decodes video frames using the specified backend. @@ -64,23 +71,35 @@ def decode_video_frames( backend (str, optional): Backend to use for decoding. Defaults to "torchcodec" when available in the platform; otherwise, defaults to "pyav". The legacy value "video_reader" is accepted for one release as an alias for "pyav" and will be removed in a future version. - return_uint8 (bool): If True, return raw uint8 frames without float32 normalization. + return_uint8 (bool): For RGB videos, if True return raw uint8 frames without float32 normalization. This reduces memory for DataLoader IPC; normalization can be done on GPU afterward. + is_depth (bool): Set to True if the video is a depth map (1 channel, uint12). Returns: - torch.Tensor: Decoded frames (float32 in [0,1] by default, or uint8 if return_uint8=True). + torch.Tensor: Decoded frames (RGB: float32 in [0,1] by default, or uint8 if return_uint8=True, Depth: uint12). Currently supports torchcodec on cpu and pyav. """ + if backend != "pyav" and is_depth: + logger.debug("Decoding depth maps is only supported with the 'pyav' backend, falling back to pyav.") + # We do not actually return uint8 here, but we avoid the 255 normalization step. 
+ return decode_video_frames_pyav( + video_path, timestamps, tolerance_s, return_uint8=False, is_depth=True + ) + if backend is None: backend = get_safe_default_video_backend() if backend == "torchcodec": return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8) elif backend == "pyav": - return decode_video_frames_pyav(video_path, timestamps, tolerance_s, return_uint8=return_uint8) + return decode_video_frames_pyav( + video_path, timestamps, tolerance_s, return_uint8=return_uint8, is_depth=is_depth + ) elif backend == "video_reader": logger.warning("backend='video_reader' is deprecated and now aliases to 'pyav'.") - return decode_video_frames_pyav(video_path, timestamps, tolerance_s, return_uint8=return_uint8) + return decode_video_frames_pyav( + video_path, timestamps, tolerance_s, return_uint8=return_uint8, is_depth=is_depth + ) else: raise ValueError(f"Unsupported video backend: {backend}") @@ -91,6 +110,7 @@ def decode_video_frames_pyav( tolerance_s: float, log_loaded_timestamps: bool = False, return_uint8: bool = False, + is_depth: bool = False, ) -> torch.Tensor: """Loads frames associated to the requested timestamps of a video using PyAV. @@ -109,8 +129,9 @@ def decode_video_frames_pyav( tolerance_s: Allowed deviation in seconds between a queried timestamp and the closest decoded frame. log_loaded_timestamps: When True, log every decoded frame's timestamp at INFO level. - return_uint8: When True, return raw uint8 frames (C, H, W). Otherwise, return float32 in - [0, 1] range. + return_uint8: For RGB videos, if True return raw uint8 frames (C, H, W). + Otherwise, return float32 in [0, 1] range. + is_depth: Set to True if the video is a depth map (1 channel, uint12). Returns: torch.Tensor of shape (len(timestamps), C, H, W). 
@@ -132,7 +153,13 @@ def decode_video_frames_pyav( # https://pyav.basswood-io.com/docs/stable/api/container.html#av.container.InputContainer.seek with av.open(video_path) as container: stream = container.streams.video[0] - container.seek(int(first_ts * av.time_base), backward=True) + # Seek to the nearest keyframe at or before `first_ts`, with a one-tick margin (offset is in `stream.time_base` units) + container.seek( + round(first_ts / stream.time_base) - 1, + backward=True, + any_frame=False, + stream=stream, + ) for frame in container.decode(stream): if frame.pts is None: @@ -140,9 +167,13 @@ def decode_video_frames_pyav( current_ts = float(frame.pts * stream.time_base) if log_loaded_timestamps: logger.info(f"frame loaded at timestamp={current_ts:.4f}") - # Convert to CHW uint8 to match torchcodec's output layout. - arr = frame.to_ndarray(format="rgb24") # H, W, 3 - loaded_frames.append(torch.from_numpy(arr).permute(2, 0, 1).contiguous()) + if is_depth: + arr = frame.to_ndarray(format="gray12le") # (H, W) uint12 + loaded_frames.append(torch.from_numpy(arr).unsqueeze(0).contiguous()) + else: + arr = frame.to_ndarray(format="rgb24") # (H, W, 3) + # Convert to CHW uint8 to match torchcodec's output layout. 
+ loaded_frames.append(torch.from_numpy(arr).permute(2, 0, 1).contiguous()) loaded_ts.append(current_ts) if current_ts >= last_ts: break @@ -185,7 +216,7 @@ def decode_video_frames_pyav( f"number of queried timestamps ({len(timestamps)})" ) - if return_uint8: + if return_uint8 or is_depth: return closest_frames # convert to the pytorch format which is float32 in [0,1] range (and channel first) @@ -406,17 +437,38 @@ def encode_video_frames( imgs_dir: Path | str, video_path: Path | str, fps: int, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, *, log_level: int | None = av.logging.WARNING, overwrite: bool = False, ) -> None: - """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" - if camera_encoder is None: - camera_encoder = camera_encoder_defaults() - vcodec = camera_encoder.vcodec - pix_fmt = camera_encoder.pix_fmt + """Encode a directory of image frames into an MP4 video. + + When ``video_encoder`` is a :class:`~lerobot.configs.video.DepthEncoderConfig`, + frames are read from ``.tiff`` files and quantized to 12-bit depth codes using the + encoder's ``depth_min`` / ``depth_max`` / ``shift`` / ``use_log``; otherwise ``.png`` + RGB frames are encoded directly. + + Args: + imgs_dir: Directory containing the frames to encode, named ``frame-000000`` + onwards (``.png`` for RGB, ``.tiff`` for depth). + video_path: Output path for the encoded ``.mp4`` file. + fps: Frame rate of the output video. + video_encoder: Encoder settings (codec, pixel format, quality, ...). When + ``None``, :func:`rgb_encoder_defaults` is used. Pass a + :class:`~lerobot.configs.video.DepthEncoderConfig` to encode depth frames. + encoder_threads: Per-encoder thread count forwarded to the codec. ``None`` + lets the codec decide. + log_level: libav log level to set while encoding, or ``None`` to leave the + current logging configuration unchanged. 
+ overwrite: When ``False`` and ``video_path`` already exists, skip encoding and + log a warning. When ``True``, re-encode and replace the existing file. + """ + if video_encoder is None: + video_encoder = rgb_encoder_defaults() + vcodec = video_encoder.vcodec + pix_fmt = video_encoder.pix_fmt video_path = Path(video_path) imgs_dir = Path(imgs_dir) @@ -428,17 +480,19 @@ def encode_video_frames( video_path.parent.mkdir(parents=True, exist_ok=True) # Get input frames - template = "frame-" + ("[0-9]" * 6) + ".png" + is_depth = isinstance(video_encoder, DepthEncoderConfig) + suffix = ".png" if not is_depth else ".tiff" + template = "frame-" + ("[0-9]" * 6) + suffix input_list = sorted( glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0]) ) if len(input_list) == 0: - raise FileNotFoundError(f"No images found in {imgs_dir}.") + raise FileNotFoundError(f"No images with suffix {suffix} found in {imgs_dir}.") with Image.open(input_list[0]) as dummy_image: width, height = dummy_image.size - video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) + video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True) # Set logging level if log_level is not None: @@ -455,8 +509,19 @@ def encode_video_frames( # Loop through input frames and encode them for input_data in input_list: with Image.open(input_data) as input_image: - input_image = input_image.convert("RGB") - input_frame = av.VideoFrame.from_image(input_image) + if is_depth: + input_frame = quantize_depth( + np.array(input_image), + depth_min=video_encoder.depth_min, + depth_max=video_encoder.depth_max, + shift=video_encoder.shift, + use_log=video_encoder.use_log, + pix_fmt=video_encoder.pix_fmt, + video_backend="pyav", + ) + else: + input_image = input_image.convert("RGB") + input_frame = av.VideoFrame.from_image(input_image) packet = output_stream.encode(input_frame) if packet: output.mux(packet) @@ -477,7 +542,7 @@ def encode_video_frames( def 
reencode_video( input_video_path: Path | str, output_video_path: Path | str, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, log_level: int | None = av.logging.WARNING, overwrite: bool = False, @@ -489,7 +554,7 @@ def reencode_video( Args: input_video_path: Existing video file to read. output_video_path: Path for the re-encoded file. - camera_encoder: Encoder configuration. Defaults to :func:`camera_encoder_defaults`. + video_encoder: Encoder configuration. Defaults to :func:`rgb_encoder_defaults`. encoder_threads: Optional thread count forwarded to :meth:`VideoEncoderConfig.get_codec_options`. log_level: libav log level while encoding, or ``None`` to leave logging unchanged. Defaults to WARNING. overwrite: When ``False`` and ``output_video_path`` already exists, skip and log a warning. @@ -497,7 +562,7 @@ def reencode_video( end_time_s: When set, trim the output to end at this timestamp (seconds, exclusive). 
""" - camera_encoder = camera_encoder or camera_encoder_defaults() + video_encoder = video_encoder or rgb_encoder_defaults() if (start_time_s is not None and start_time_s < 0) or (end_time_s is not None and end_time_s < 0): raise ValueError(f"Trim times must be non-negative, got start={start_time_s}, end={end_time_s}.") @@ -512,9 +577,9 @@ def reencode_video( output_video_path.parent.mkdir(parents=True, exist_ok=True) - video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) - vcodec = camera_encoder.vcodec - pix_fmt = camera_encoder.pix_fmt + video_options = video_encoder.get_codec_options(encoder_threads, as_strings=True) + vcodec = video_encoder.vcodec + pix_fmt = video_encoder.pix_fmt with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_named_file: tmp_output_video_path = tmp_named_file.name @@ -696,22 +761,21 @@ class _CameraEncoderThread(threading.Thread): self, video_path: Path, fps: int, - vcodec: str, - pix_fmt: str, - codec_options: dict[str, str], + video_encoder: VideoEncoderConfig, frame_queue: queue.Queue, result_queue: queue.Queue, stop_event: threading.Event, + encoder_threads: int | None = None, ): super().__init__(daemon=True) self.video_path = video_path self.fps = fps - self.vcodec = vcodec - self.pix_fmt = pix_fmt - self.codec_options = codec_options + self.video_encoder = video_encoder + self.is_depth = isinstance(video_encoder, DepthEncoderConfig) self.frame_queue = frame_queue self.result_queue = result_queue self.stop_event = stop_event + self.encoder_threads = encoder_threads def run(self) -> None: from .compute_stats import RunningQuantileStats, auto_downsample_height_width @@ -736,12 +800,12 @@ class _CameraEncoderThread(threading.Thread): # Sentinel: flush and close break - # Ensure HWC uint8 numpy array + # Ensure HWC (RGB or depth) uint8 (RGB only) numpy array if isinstance(frame_data, np.ndarray): - if frame_data.ndim == 3 and frame_data.shape[0] == 3: + if frame_data.ndim == 3 and 
frame_data.shape[0] in (1, 3): # CHW -> HWC frame_data = frame_data.transpose(1, 2, 0) - if frame_data.dtype != np.uint8: + if not self.is_depth and frame_data.dtype != np.uint8: frame_data = (frame_data * 255).astype(np.uint8) # Open container on first frame (to get width/height) @@ -749,15 +813,29 @@ class _CameraEncoderThread(threading.Thread): height, width = frame_data.shape[:2] Path(self.video_path).parent.mkdir(parents=True, exist_ok=True) container = av.open(str(self.video_path), "w") - output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options) - output_stream.pix_fmt = self.pix_fmt + output_stream = container.add_stream( + self.video_encoder.vcodec, + self.fps, + options=self.video_encoder.get_codec_options(self.encoder_threads, as_strings=True), + ) + output_stream.pix_fmt = self.video_encoder.pix_fmt output_stream.width = width output_stream.height = height output_stream.time_base = Fraction(1, self.fps) # Encode frame with explicit timestamps - pil_img = Image.fromarray(frame_data) - video_frame = av.VideoFrame.from_image(pil_img) + if not self.is_depth: + pil_img = Image.fromarray(frame_data) + video_frame = av.VideoFrame.from_image(pil_img) + else: + video_frame = quantize_depth( + frame_data, + depth_min=self.video_encoder.depth_min, + depth_max=self.video_encoder.depth_max, + shift=self.video_encoder.shift, + use_log=self.video_encoder.use_log, + video_backend=self.video_encoder.video_backend, + ) video_frame.pts = frame_count video_frame.time_base = Fraction(1, self.fps) packet = output_stream.encode(video_frame) @@ -815,22 +893,27 @@ class StreamingVideoEncoder: def __init__( self, fps: int, - camera_encoder: VideoEncoderConfig | None = None, + rgb_encoder: RGBEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, queue_maxsize: int = 30, encoder_threads: int | None = None, ): """ Args: fps: Frames per second for the output videos. 
- camera_encoder: Video encoder settings applied to all cameras. - When ``None``, :func:`camera_encoder_defaults` is used. - encoder_threads: Number of encoder threads (global setting). - ``None`` lets the codec decide. + rgb_encoder: Video encoder settings applied to all RGB cameras. + When ``None``, :func:`rgb_encoder_defaults` is used. + depth_encoder: Video encoder settings applied to all depth cameras, + including the depth quantization parameters. When ``None``, + :func:`depth_encoder_defaults` is used. queue_maxsize: Max frames to buffer per camera before back-pressure drops frames. + encoder_threads: Number of encoder threads (global setting). + ``None`` lets the codec decide. """ self.fps = fps - self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._rgb_encoder = rgb_encoder or rgb_encoder_defaults() + self._depth_encoder = depth_encoder or depth_encoder_defaults() self._encoder_threads = encoder_threads self.queue_maxsize = queue_maxsize @@ -843,18 +926,25 @@ class StreamingVideoEncoder: self._episode_active = False self._closed = False - def start_episode(self, video_keys: list[str], temp_dir: Path) -> None: + def start_episode( + self, video_keys: list[str], temp_dir: Path, depth_video_keys: list[str] | None = None + ) -> None: """Start encoder threads for a new episode. Args: video_keys: List of video feature keys (e.g. ["observation.images.laptop"]) temp_dir: Base directory for temporary MP4 files + depth_video_keys: List of video or image feature keys that carry depth maps (e.g. + ["observation.images.laptop_depth"]). Defaults to ``[]`` (no depth keys). 
""" if self._episode_active: self.cancel_episode() self._dropped_frames.clear() + if depth_video_keys is None: + depth_video_keys = [] + for video_key in video_keys: frame_queue: queue.Queue = queue.Queue(maxsize=self.queue_maxsize) result_queue: queue.Queue = queue.Queue(maxsize=1) @@ -863,17 +953,15 @@ class StreamingVideoEncoder: temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir)) video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" - vcodec = self._camera_encoder.vcodec - codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True) + encoder = self._depth_encoder if video_key in depth_video_keys else self._rgb_encoder encoder_thread = _CameraEncoderThread( video_path=video_path, fps=self.fps, - vcodec=vcodec, - pix_fmt=self._camera_encoder.pix_fmt, - codec_options=codec_options, + video_encoder=encoder, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, + encoder_threads=self._encoder_threads, ) encoder_thread.start() @@ -1080,15 +1168,23 @@ def get_audio_info(video_path: Path | str) -> dict: def get_video_info( video_path: Path | str, - camera_encoder: VideoEncoderConfig | None = None, + video_encoder: VideoEncoderConfig | None = None, ) -> dict: """Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``. Args: video_path: Path to the encoded video file to probe. - camera_encoder: If provided, record the exact encoder settings used to encode this + video_encoder: If provided, record the exact encoder settings used to encode this video. Stream-derived values take precedence — encoder fields are only written for keys - not already populated from the video file itself. + not already populated from the video file itself. When a + :class:`~lerobot.configs.video.DepthEncoderConfig` is passed, the depth + quantization parameters (``depth_min`` / ``depth_max`` / ``shift`` / + ``use_log``) are recorded so frames can be dequantized on read. 
+ + Returns: + The ``video.*`` / ``audio.*`` info dict, including ``is_depth_map`` which is + ``True`` only when ``video_encoder`` is a + :class:`~lerobot.configs.video.DepthEncoderConfig`. """ logging.getLogger("libav").setLevel(av.logging.WARNING) @@ -1106,13 +1202,10 @@ def get_video_info( video_info["video.width"] = video_stream.width video_info["video.codec"] = video_stream.codec.canonical_name video_info["video.pix_fmt"] = video_stream.pix_fmt - video_info["video.is_depth_map"] = False # Calculate fps from r_frame_rate video_info["video.fps"] = int(video_stream.base_rate) - - pixel_channels = get_video_pixel_channels(video_stream.pix_fmt) - video_info["video.channels"] = pixel_channels + video_info["video.channels"] = get_pix_fmt_channels(video_stream.pix_fmt) # Reset logging level av.logging.restore_default_callback() @@ -1121,27 +1214,18 @@ def get_video_info( video_info.update(**get_audio_info(video_path)) # Add additional encoder configuration if provided - if camera_encoder is not None: - for field_name, field_value in asdict(camera_encoder).items(): + if video_encoder is not None: + for field_name, field_value in asdict(video_encoder).items(): # vcodec is already populated from the video stream if field_name == "vcodec": continue video_info.setdefault(f"video.{field_name}", field_value) + video_info["is_depth_map"] = isinstance(video_encoder, DepthEncoderConfig) + return video_info -def get_video_pixel_channels(pix_fmt: str) -> int: - if "gray" in pix_fmt or "depth" in pix_fmt or "monochrome" in pix_fmt: - return 1 - elif "rgba" in pix_fmt or "yuva" in pix_fmt: - return 4 - elif "rgb" in pix_fmt or "yuv" in pix_fmt: - return 3 - else: - raise ValueError("Unknown format") - - def get_video_duration_in_s(video_path: Path | str) -> float: """ Get the duration of a video file in seconds using PyAV. 
@@ -1202,10 +1286,13 @@ class VideoEncodingManager: img_dir = self.dataset.root / "images" if img_dir.exists(): png_files = list(img_dir.rglob("*.png")) - if len(png_files) == 0: + tiff_files = list(img_dir.rglob("*.tiff")) + if len(png_files) == 0 and len(tiff_files) == 0: shutil.rmtree(img_dir) logger.debug("Cleaned up empty images directory") else: - logger.debug(f"Images directory is not empty, containing {len(png_files)} PNG files") + logger.debug( + f"Images directory is not empty, containing {len(png_files)} PNG and {len(tiff_files)} TIFF files" + ) return False # Don't suppress the original exception diff --git a/src/lerobot/policies/utils.py b/src/lerobot/policies/utils.py index c37127813..f465fcff8 100644 --- a/src/lerobot/policies/utils.py +++ b/src/lerobot/policies/utils.py @@ -126,7 +126,8 @@ def prepare_observation_for_inference( for name in observation: observation[name] = torch.from_numpy(observation[name]) if "image" in name: - observation[name] = observation[name].type(torch.float32) / 255 + if observation[name].dtype == torch.uint8: + observation[name] = observation[name].type(torch.float32) / 255 observation[name] = observation[name].permute(2, 0, 1).contiguous() observation[name] = observation[name].unsqueeze(0) observation[name] = observation[name].to(device) diff --git a/src/lerobot/robots/hope_jr/hope_jr_arm.py b/src/lerobot/robots/hope_jr/hope_jr_arm.py index 4918bcae3..b606a4fe7 100644 --- a/src/lerobot/robots/hope_jr/hope_jr_arm.py +++ b/src/lerobot/robots/hope_jr/hope_jr_arm.py @@ -66,9 +66,14 @@ class HopeJrArm(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, 
cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -139,10 +144,17 @@ class HopeJrArm(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") return obs_dict diff --git a/src/lerobot/robots/hope_jr/hope_jr_hand.py b/src/lerobot/robots/hope_jr/hope_jr_hand.py index 566628724..ce70e7e13 100644 --- a/src/lerobot/robots/hope_jr/hope_jr_hand.py +++ b/src/lerobot/robots/hope_jr/hope_jr_hand.py @@ -102,9 +102,14 @@ class HopeJrHand(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -170,10 +175,17 @@ class HopeJrHand(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + 
start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") return obs_dict diff --git a/src/lerobot/robots/koch_follower/koch_follower.py b/src/lerobot/robots/koch_follower/koch_follower.py index 3f40ac738..de6f9c4a3 100644 --- a/src/lerobot/robots/koch_follower/koch_follower.py +++ b/src/lerobot/robots/koch_follower/koch_follower.py @@ -68,9 +68,14 @@ class KochFollower(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -192,10 +197,17 @@ class KochFollower(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") 
return obs_dict diff --git a/src/lerobot/robots/lekiwi/lekiwi.py b/src/lerobot/robots/lekiwi/lekiwi.py index b73ebeab9..3712a64d3 100644 --- a/src/lerobot/robots/lekiwi/lekiwi.py +++ b/src/lerobot/robots/lekiwi/lekiwi.py @@ -72,6 +72,12 @@ class LeKiwi(Robot): ) self.arm_motors = [motor for motor in self.bus.motors if motor.startswith("arm")] self.base_motors = [motor for motor in self.bus.motors if motor.startswith("base")] + depth_cameras = [name for name, cfg in config.cameras.items() if getattr(cfg, "use_depth", False)] + if depth_cameras: + raise NotImplementedError( + f"Depth cameras are not supported on LeKiwi (got depth-enabled cameras: {depth_cameras}). " + "The host/client transport only carries color frames." + ) self.cameras = make_cameras_from_configs(config.cameras) @property diff --git a/src/lerobot/robots/lekiwi/lekiwi_client.py b/src/lerobot/robots/lekiwi/lekiwi_client.py index fd43e84fe..1bc3dadc4 100644 --- a/src/lerobot/robots/lekiwi/lekiwi_client.py +++ b/src/lerobot/robots/lekiwi/lekiwi_client.py @@ -44,6 +44,13 @@ class LeKiwiClient(Robot): self.id = config.id self.robot_type = config.type + depth_cameras = [name for name, cfg in config.cameras.items() if getattr(cfg, "use_depth", False)] + if depth_cameras: + raise NotImplementedError( + f"Depth cameras are not supported on LeKiwi (got depth-enabled cameras: {depth_cameras}). " + "The host/client transport only carries color frames." 
+ ) + self.remote_ip = config.remote_ip self.port_zmq_cmd = config.port_zmq_cmd self.port_zmq_observations = config.port_zmq_observations diff --git a/src/lerobot/robots/omx_follower/omx_follower.py b/src/lerobot/robots/omx_follower/omx_follower.py index c30eec97a..b2cfb52e9 100644 --- a/src/lerobot/robots/omx_follower/omx_follower.py +++ b/src/lerobot/robots/omx_follower/omx_follower.py @@ -68,9 +68,14 @@ class OmxFollower(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -175,10 +180,17 @@ class OmxFollower(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") return obs_dict diff --git a/src/lerobot/robots/openarm_follower/openarm_follower.py b/src/lerobot/robots/openarm_follower/openarm_follower.py index 020f24052..e2c7c8cf5 100644 --- a/src/lerobot/robots/openarm_follower/openarm_follower.py +++ 
b/src/lerobot/robots/openarm_follower/openarm_follower.py @@ -101,9 +101,14 @@ class OpenArmFollower(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: """Camera features for observation space.""" - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -242,10 +247,17 @@ class OpenArmFollower(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + cam_start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - cam_start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + cam_start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - cam_start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") dt_ms = (time.perf_counter() - start) * 1e3 logger.debug(f"{self} get_observation took: {dt_ms:.1f}ms") diff --git a/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py b/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py index ec00f4aa9..bf989702b 100644 --- a/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py +++ b/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py @@ -80,9 +80,14 @@ class RebotB601Follower(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height,
self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -213,10 +218,17 @@ class RebotB601Follower(Robot): logger.debug(f"{self} read state: {dt_ms:.1f}ms") for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") return obs_dict diff --git a/src/lerobot/robots/so_follower/so_follower.py b/src/lerobot/robots/so_follower/so_follower.py index 0651f566c..c6e67fafe 100644 --- a/src/lerobot/robots/so_follower/so_follower.py +++ b/src/lerobot/robots/so_follower/so_follower.py @@ -68,9 +68,13 @@ class SOFollower(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + if getattr(self.cameras[cam], "use_rgb", True): + features[cam] = (self.cameras[cam].height, self.cameras[cam].width, 3) + if getattr(self.cameras[cam], "use_depth", False): + features[f"{cam}_depth"] = (self.cameras[cam].height, self.cameras[cam].width, 1) + return 
features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -185,10 +189,17 @@ class SOFollower(Robot): # Capture images from cameras for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.read_latest() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + if getattr(cam, "use_rgb", True): + start = time.perf_counter() + obs_dict[cam_key] = cam.read_latest() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") + + if getattr(cam, "use_depth", False): + start = time.perf_counter() + obs_dict[f"{cam_key}_depth"] = cam.read_latest_depth() + dt_ms = (time.perf_counter() - start) * 1e3 + logger.debug(f"{self} read {cam_key} depth: {dt_ms:.1f}ms") return obs_dict diff --git a/src/lerobot/robots/unitree_g1/unitree_g1.py b/src/lerobot/robots/unitree_g1/unitree_g1.py index 25ec32716..5b8be0941 100644 --- a/src/lerobot/robots/unitree_g1/unitree_g1.py +++ b/src/lerobot/robots/unitree_g1/unitree_g1.py @@ -222,9 +222,14 @@ class UnitreeG1(Robot): @property def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } + features: dict[str, tuple] = {} + for cam in self.cameras: + cfg = self.config.cameras[cam] + if getattr(cfg, "use_rgb", True): + features[cam] = (cfg.height, cfg.width, 3) + if getattr(cfg, "use_depth", False): + features[f"{cam}_depth"] = (cfg.height, cfg.width, 1) + return features @cached_property def observation_features(self) -> dict[str, type | tuple]: @@ -458,7 +463,10 @@ class UnitreeG1(Robot): # Cameras - read images from ZMQ cameras for cam_name, cam in self._cameras.items(): - obs[cam_name] = cam.read_latest() + if getattr(cam, "use_rgb", True): + obs[cam_name] = cam.read_latest() + if getattr(cam, "use_depth", False): + obs[f"{cam_name}_depth"] = cam.read_latest_depth() return obs diff 
--git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py index bf5fa0fd4..62d844932 100644 --- a/src/lerobot/rollout/context.py +++ b/src/lerobot/rollout/context.py @@ -332,7 +332,8 @@ def build_rollout_context( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - camera_encoder=cfg.dataset.camera_encoder, + rgb_encoder=cfg.dataset.rgb_encoder, + depth_encoder=cfg.dataset.depth_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, @@ -367,7 +368,8 @@ def build_rollout_context( image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras if hasattr(robot, "cameras") else []), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - camera_encoder=cfg.dataset.camera_encoder, + rgb_encoder=cfg.dataset.rgb_encoder, + depth_encoder=cfg.dataset.depth_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, diff --git a/src/lerobot/scripts/lerobot_dataset_viz.py b/src/lerobot/scripts/lerobot_dataset_viz.py index d07a2767d..21ae1ac9d 100644 --- a/src/lerobot/scripts/lerobot_dataset_viz.py +++ b/src/lerobot/scripts/lerobot_dataset_viz.py @@ -77,15 +77,28 @@ from lerobot.utils.constants import ACTION, DONE, OBS_STATE, REWARD from lerobot.utils.utils import init_logging +def check_chw_float32(frame: torch.Tensor) -> None: + """ + Check if a frame is a channel-first, float32 tensor. 
+ """ + assert frame.dtype == torch.float32 + assert frame.ndim == 3 + c, h, w = frame.shape + assert c < h and c < w, f"expect channel first images, but instead {frame.shape}" + + def to_hwc_uint8_numpy(chw_float32_torch: torch.Tensor) -> np.ndarray: - assert chw_float32_torch.dtype == torch.float32 - assert chw_float32_torch.ndim == 3 - c, h, w = chw_float32_torch.shape - assert c < h and c < w, f"expect channel first images, but instead {chw_float32_torch.shape}" + check_chw_float32(chw_float32_torch) hwc_uint8_numpy = (chw_float32_torch * 255).type(torch.uint8).permute(1, 2, 0).numpy() return hwc_uint8_numpy +def to_hwc_uint16_numpy(chw_float32_torch: torch.Tensor) -> np.ndarray: + check_chw_float32(chw_float32_torch) + hwc_uint16_numpy = chw_float32_torch.round().type(torch.uint16).permute(1, 2, 0).numpy() + return hwc_uint16_numpy + + def visualize_dataset( dataset: LeRobotDataset, episode_index: int, @@ -138,6 +151,14 @@ def visualize_dataset( logging.info("Logging to Rerun") + # Use the dataset's q01/q99 depth statistics for robust depth range bounds + depth_ranges = {} + for key in dataset.meta.depth_keys: + stats = dataset.meta.stats[key] + lo = stats["q01"] if "q01" in stats else stats["min"] + hi = stats["q99"] if "q99" in stats else stats["max"] + depth_ranges[key] = (float(np.asarray(lo).item()), float(np.asarray(hi).item())) + first_index = None for batch in tqdm.tqdm(dataloader, total=len(dataloader)): if first_index is None: @@ -149,9 +170,18 @@ def visualize_dataset( # display each camera image for key in dataset.meta.camera_keys: - img = to_hwc_uint8_numpy(batch[key][i]) - img_entity = rr.Image(img).compress() if display_compressed_images else rr.Image(img) - rr.log(key, entity=img_entity) + if key in dataset.meta.depth_keys: + depth = to_hwc_uint16_numpy(batch[key][i]) + depth_entity = rr.DepthImage( + depth, + colormap=rr.components.Colormap.Viridis, + depth_range=depth_ranges[key], + ) + rr.log(key, entity=depth_entity) + else: + img = 
to_hwc_uint8_numpy(batch[key][i]) + img_entity = rr.Image(img).compress() if display_compressed_images else rr.Image(img) + rr.log(key, entity=img_entity) # display each dimension of action space (e.g. actuators command) if ACTION in batch: diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index eaadf47de..42dce438f 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -133,6 +133,15 @@ Convert image dataset to video format and save locally: --new_root /path/to/output/pusht_video \ --operation.type convert_image_to_video +Convert image dataset (with depth maps) to video format, customizing the depth encoder: + lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --new_root /path/to/output/pusht_video \ + --operation.type convert_image_to_video \ + --operation.depth_encoder.depth_min 0.01 \ + --operation.depth_encoder.depth_max 10.0 \ + --operation.depth_encoder.use_log true + Convert image dataset to video format and save with new repo_id: lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ @@ -190,17 +199,17 @@ Re-encode all videos in a dataset (saves to lerobot/pusht_reencoded by default): lerobot-edit-dataset \ --repo_id lerobot/pusht \ --operation.type reencode_videos \ - --operation.camera_encoder.vcodec h264 \ - --operation.camera_encoder.pix_fmt yuv420p \ - --operation.camera_encoder.crf 23 + --operation.rgb_encoder.vcodec h264 \ + --operation.rgb_encoder.pix_fmt yuv420p \ + --operation.rgb_encoder.crf 23 Re-encode videos into a new dataset using 4 parallel processes: lerobot-edit-dataset \ --repo_id lerobot/pusht \ --new_repo_id lerobot/pusht_h264 \ --operation.type reencode_videos \ - --operation.camera_encoder.vcodec h264 \ - --operation.camera_encoder.crf 23 \ + --operation.rgb_encoder.vcodec h264 \ + --operation.rgb_encoder.crf 23 \ --operation.num_workers 4 Re-encode videos in-place (overwrites original dataset): @@ -208,9 +217,16 @@ 
Re-encode videos in-place (overwrites original dataset): --repo_id lerobot/pusht \ --new_repo_id lerobot/pusht \ --operation.type reencode_videos \ - --operation.camera_encoder.vcodec h264 \ + --operation.rgb_encoder.vcodec h264 \ --operation.overwrite true +Re-encode both RGB and depth videos in a dataset (depth quantization params are preserved): + lerobot-edit-dataset \ + --repo_id lerobot/pusht_depth \ + --operation.type reencode_videos \ + --operation.rgb_encoder.vcodec h264 \ + --operation.depth_encoder.extra_options '{"x265-params": "lossless=1"}' + Using JSON config file: lerobot-edit-dataset \ --config_path path/to/edit_config.json @@ -225,7 +241,13 @@ from pathlib import Path import draccus -from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser +from lerobot.configs import ( + DepthEncoderConfig, + RGBEncoderConfig, + depth_encoder_defaults, + parser, + rgb_encoder_defaults, +) from lerobot.datasets import ( LeRobotDataset, convert_image_to_video_dataset, @@ -287,7 +309,8 @@ class ModifyTasksConfig(OperationConfig): @dataclass class ConvertImageToVideoConfig(OperationConfig): output_dir: str | None = None - camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + rgb_encoder: RGBEncoderConfig = field(default_factory=rgb_encoder_defaults) + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) episode_indices: list[int] | None = None num_workers: int = 4 max_episodes_per_batch: int | None = None @@ -308,7 +331,8 @@ class RecomputeStatsConfig(OperationConfig): @OperationConfig.register_subclass("reencode_videos") @dataclass class ReencodeVideosConfig(OperationConfig): - camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + rgb_encoder: RGBEncoderConfig = field(default_factory=rgb_encoder_defaults) + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) num_workers: int = 0 encoder_threads: int | None = None 
overwrite: bool = False @@ -601,7 +625,8 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset=dataset, output_dir=output_dir, repo_id=output_repo_id, - camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(), + rgb_encoder=getattr(cfg.operation, "rgb_encoder", None) or rgb_encoder_defaults(), + depth_encoder=getattr(cfg.operation, "depth_encoder", None) or depth_encoder_defaults(), episode_indices=getattr(cfg.operation, "episode_indices", None), num_workers=getattr(cfg.operation, "num_workers", 4), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), @@ -719,10 +744,14 @@ def handle_reencode_videos(cfg: EditDatasetConfig) -> None: shutil.copytree(input_root, output_root) dataset = LeRobotDataset(output_repo_id, root=output_root) - logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}") + logging.info( + f"Re-encoding videos in {output_repo_id} with RGB encoder {cfg.operation.rgb_encoder} " + f"and depth encoder {cfg.operation.depth_encoder}" + ) reencode_dataset( dataset, - camera_encoder=cfg.operation.camera_encoder, + rgb_encoder=cfg.operation.rgb_encoder, + depth_encoder=cfg.operation.depth_encoder, encoder_threads=cfg.operation.encoder_threads, num_workers=cfg.operation.num_workers, ) diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 4d5518c7c..b759d86e0 100644 --- a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -79,9 +79,9 @@ lerobot-record \\ --dataset.single_task="Grab the cube" \\ --dataset.streaming_encoding=true \\ --dataset.encoder_threads=2 \\ - --dataset.camera_encoder.vcodec=h264 \\ - --dataset.camera_encoder.preset=fast \\ - --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \\ + --dataset.rgb_encoder.vcodec=h264 \\ + --dataset.rgb_encoder.preset=fast \\ + --dataset.rgb_encoder.extra_options={"tune": "film", 
"profile:v": "high", "bf": 2} \\ --display_data=true ``` """ @@ -400,7 +400,8 @@ def record( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - camera_encoder=cfg.dataset.camera_encoder, + rgb_encoder=cfg.dataset.rgb_encoder, + depth_encoder=cfg.dataset.depth_encoder, encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, @@ -429,7 +430,8 @@ def record( image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - camera_encoder=cfg.dataset.camera_encoder, + rgb_encoder=cfg.dataset.rgb_encoder, + depth_encoder=cfg.dataset.depth_encoder, encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, @@ -443,7 +445,7 @@ def record( if not cfg.dataset.streaming_encoding: logging.info( - "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" + "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.rgb_encoder.vcodec=auto. 
More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" ) with VideoEncodingManager(dataset): diff --git a/src/lerobot/scripts/lerobot_rollout.py b/src/lerobot/scripts/lerobot_rollout.py index 8515c4cc9..daee87bbe 100644 --- a/src/lerobot/scripts/lerobot_rollout.py +++ b/src/lerobot/scripts/lerobot_rollout.py @@ -142,9 +142,9 @@ Usage examples --robot.port=/dev/ttyACM0 \\ --task="pick up cube" --duration=60 \\ --display_data=true \\ - --dataset.camera_encoder.vcodec=h264 \\ - --dataset.camera_encoder.preset=fast \\ - --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} + --dataset.rgb_encoder.vcodec=h264 \\ + --dataset.rgb_encoder.preset=fast \\ + --dataset.rgb_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} """ import logging diff --git a/src/lerobot/utils/feature_utils.py b/src/lerobot/utils/feature_utils.py index 2a4886234..38516d6ab 100644 --- a/src/lerobot/utils/feature_utils.py +++ b/src/lerobot/utils/feature_utils.py @@ -51,7 +51,9 @@ def hw_to_dataset_features( This function takes a dictionary describing hardware outputs (like joint states or camera image shapes) and formats it into the standard LeRobot feature - specification. + specification. Single-channel cameras (shape ``(H, W, 1)``) are flagged as depth + maps via ``info["is_depth_map"] = True``; three-channel cameras ``(H, W, 3)`` are + treated as RGB. Args: hw_features (dict): Dictionary mapping feature names to their type (float for @@ -61,7 +63,7 @@ def hw_to_dataset_features( use_video (bool): If True, image features are marked as "video", otherwise "image". Returns: - dict: A LeRobot features dictionary. + dict: A LeRobot features dictionary. Depth cameras carry ``info["is_depth_map"] = True``. 
""" features = {} joint_fts = { @@ -69,6 +71,7 @@ def hw_to_dataset_features( for key, ftype in hw_features.items() if ftype is float or (isinstance(ftype, PolicyFeature) and ftype.type != FeatureType.VISUAL) } + # TODO(CarolinePascal): we should not rely on the shape to determine if a feature is a camera ! cam_fts = {key: shape for key, shape in hw_features.items() if isinstance(shape, tuple)} if joint_fts and prefix == ACTION: @@ -86,11 +89,19 @@ def hw_to_dataset_features( } for key, shape in cam_fts.items(): - features[f"{prefix}.images.{key}"] = { - "dtype": "video" if use_video else "image", - "shape": shape, - "names": ["height", "width", "channels"], - } + dtype = "video" if use_video else "image" + if len(shape) == 3 and shape[2] in (1, 3): + features[f"{prefix}.images.{key}"] = { + "dtype": dtype, + "shape": shape, + "names": ["height", "width", "channels"], + "info": {"is_depth_map": shape[2] == 1}, + } + else: + raise ValueError( + f"Camera feature '{key}' has shape {shape}. " + f"Expected a 3-tuple (H, W, C), e.g. (480, 640, 3) for RGB or (480, 640, 1) for depth." + ) _validate_feature_names(features) return features @@ -149,11 +160,11 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea type = FeatureType.VISUAL if len(shape) != 3: raise ValueError(f"Number of dimensions of {key} != 3 (shape={shape})") - - names = ft["names"] - # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets. - if names[2] in ["channel", "channels"]: # (h, w, c) -> (c, h, w) - shape = (shape[2], shape[0], shape[1]) + else: + names = ft["names"] + # Backward compatibility for "channel" which is an error introduced in LeRobotDataset v2.0 for ported datasets. 
+ if names[2] in ["channel", "channels"]: # (h, w, c) -> (c, h, w) + shape = (shape[2], shape[0], shape[1]) elif key == OBS_ENV_STATE: type = FeatureType.ENV elif key.startswith(OBS_STR): diff --git a/src/lerobot/utils/visualization_utils.py b/src/lerobot/utils/visualization_utils.py index d9d5bf6b5..e039f7b33 100644 --- a/src/lerobot/utils/visualization_utils.py +++ b/src/lerobot/utils/visualization_utils.py @@ -107,7 +107,10 @@ def log_rerun_data( for i, vi in enumerate(arr): rr.log(f"{key}_{i}", rr.Scalars(float(vi))) else: - img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr) + if arr.shape[-1] == 1: + img_entity = rr.DepthImage(arr, colormap=rr.components.Colormap.Viridis) + else: + img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr) rr.log(key, entity=img_entity, static=True) if action: diff --git a/tests/annotations/test_frames.py b/tests/annotations/test_frames.py index 5c9c58f7b..1a626533f 100644 --- a/tests/annotations/test_frames.py +++ b/tests/annotations/test_frames.py @@ -47,6 +47,7 @@ class _FakeMeta: def __init__(self, video_keys: list[str], image_keys: list[str], video_path: Path | None = None) -> None: self.video_keys = video_keys self.camera_keys = [*video_keys, *image_keys] + self.depth_keys = [] self._video_path = video_path self.episodes = {0: {f"videos/{key}/from_timestamp": 0.0 for key in video_keys}} @@ -208,14 +209,14 @@ def test_episode_clip_path_trims_via_reencode_video(tmp_path: Path, monkeypatch) def fake_reencode( input_video_path, output_video_path, - camera_encoder=None, + video_encoder=None, overwrite=False, start_time_s=None, end_time_s=None, ): captured.update( src=Path(input_video_path), - encoder=camera_encoder, + encoder=video_encoder, start_time_s=start_time_s, end_time_s=end_time_s, ) diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py index e9930575f..2fafd2777 100644 --- a/tests/datasets/test_aggregate.py +++ b/tests/datasets/test_aggregate.py @@ 
-29,7 +29,10 @@ from lerobot.configs import VIDEO_ENCODER_INFO_KEYS from lerobot.datasets.aggregate import aggregate_datasets from lerobot.datasets.feature_utils import features_equal_for_merge from lerobot.datasets.lerobot_dataset import LeRobotDataset -from tests.fixtures.constants import DUMMY_REPO_ID +from tests.fixtures.constants import ( + DUMMY_CAMERA_FEATURES_WITH_DEPTH, + DUMMY_REPO_ID, +) def assert_data_shards_one_row_group_per_episode(root): @@ -211,6 +214,26 @@ def assert_dataset_iteration_works(aggr_ds): pass +def assert_depth_keys_preserved(aggr_ds, ds_0, ds_1): + """Test that depth keys are correctly preserved after aggregation. + + Ensures that the ``is_depth_map`` marker on visual features survives + aggregation, so that downstream consumers (e.g. the dataset reader's + depth decoding path) keep working on the merged dataset. + """ + expected_depth_keys = set(ds_0.meta.depth_keys) + assert expected_depth_keys == set(ds_1.meta.depth_keys), ( + "Source datasets disagree on depth_keys; test setup is inconsistent" + ) + actual_depth_keys = set(aggr_ds.meta.depth_keys) + assert actual_depth_keys == expected_depth_keys, ( + f"Expected depth_keys {expected_depth_keys}, got {actual_depth_keys}" + ) + for key in expected_depth_keys: + info = aggr_ds.meta.info.features[key].get("info") or {} + assert info.get("is_depth_map") is True, f"Depth marker lost on feature {key!r} after aggregation" + + def assert_video_timestamps_within_bounds(aggr_ds): """Test that all video timestamps are within valid bounds for their respective video files. @@ -260,7 +283,11 @@ def assert_video_timestamps_within_bounds(aggr_ds): def test_aggregate_datasets(tmp_path, lerobot_dataset_factory): - """Test basic aggregation functionality with standard parameters.""" + """Test basic aggregation functionality with standard parameters. + + Source datasets include both RGB and depth video features so the same + aggregation flow is exercised on the ``is_depth_map`` branch. 
+ """ ds_0_num_frames = 400 ds_1_num_frames = 800 ds_0_num_episodes = 10 @@ -272,14 +299,21 @@ def test_aggregate_datasets(tmp_path, lerobot_dataset_factory): repo_id=f"{DUMMY_REPO_ID}_0", total_episodes=ds_0_num_episodes, total_frames=ds_0_num_frames, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) ds_1 = lerobot_dataset_factory( root=tmp_path / "test_1", repo_id=f"{DUMMY_REPO_ID}_1", total_episodes=ds_1_num_episodes, total_frames=ds_1_num_frames, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) + # Confirm depth was actually wired into the source datasets so the + # rest of the assertions exercise the depth aggregation path. + assert len(ds_0.meta.depth_keys) > 0, "ds_0 should expose at least one depth key" + assert len(ds_1.meta.depth_keys) > 0, "ds_1 should expose at least one depth key" + aggregate_datasets( repo_ids=[ds_0.repo_id, ds_1.repo_id], roots=[ds_0.root, ds_1.root], @@ -306,6 +340,7 @@ def test_aggregate_datasets(tmp_path, lerobot_dataset_factory): assert_episode_indices_updated_correctly(aggr_ds, ds_0, ds_1) assert_video_frames_integrity(aggr_ds, ds_0, ds_1) assert_video_timestamps_within_bounds(aggr_ds) + assert_depth_keys_preserved(aggr_ds, ds_0, ds_1) assert_dataset_iteration_works(aggr_ds) @@ -423,7 +458,11 @@ def test_aggregate_incomplete_video_encoder_info_warns_and_nuls_encoders( def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory): - """Test aggregation with small file size limits to force file rotation/sharding.""" + """Test aggregation with small file size limits to force file rotation/sharding. + + Depth video features are included to verify that file rotation/concat + correctly handles depth-marked features alongside regular RGB ones. 
+ """ ds_0_num_episodes = ds_1_num_episodes = 10 ds_0_num_frames = ds_1_num_frames = 400 @@ -432,14 +471,19 @@ def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory): repo_id=f"{DUMMY_REPO_ID}_small_0", total_episodes=ds_0_num_episodes, total_frames=ds_0_num_frames, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) ds_1 = lerobot_dataset_factory( root=tmp_path / "small_1", repo_id=f"{DUMMY_REPO_ID}_small_1", total_episodes=ds_1_num_episodes, total_frames=ds_1_num_frames, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) + assert len(ds_0.meta.depth_keys) > 0, "ds_0 should expose at least one depth key" + assert len(ds_1.meta.depth_keys) > 0, "ds_1 should expose at least one depth key" + # Use the new configurable parameters to force file rotation aggregate_datasets( repo_ids=[ds_0.repo_id, ds_1.repo_id], @@ -470,6 +514,7 @@ def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory): assert_episode_indices_updated_correctly(aggr_ds, ds_0, ds_1) assert_video_frames_integrity(aggr_ds, ds_0, ds_1) assert_video_timestamps_within_bounds(aggr_ds) + assert_depth_keys_preserved(aggr_ds, ds_0, ds_1) assert_dataset_iteration_works(aggr_ds) # Check that multiple files were actually created due to small size limits @@ -489,7 +534,8 @@ def test_video_timestamps_regression(tmp_path, lerobot_dataset_factory): """Regression test for video timestamp bug when merging datasets. This test specifically checks that video timestamps are correctly calculated - and accumulated when merging multiple datasets. + and accumulated when merging multiple datasets. Depth video features are + included so depth timestamps are also covered by the regression. 
""" datasets = [] for i in range(3): @@ -498,9 +544,13 @@ def test_video_timestamps_regression(tmp_path, lerobot_dataset_factory): repo_id=f"{DUMMY_REPO_ID}_regression_{i}", total_episodes=2, total_frames=100, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) datasets.append(ds) + for i, ds in enumerate(datasets): + assert len(ds.meta.depth_keys) > 0, f"Dataset {i} should expose at least one depth key" + aggregate_datasets( repo_ids=[ds.repo_id for ds in datasets], roots=[ds.root for ds in datasets], @@ -517,12 +567,21 @@ def test_video_timestamps_regression(tmp_path, lerobot_dataset_factory): aggr_ds = LeRobotDataset(f"{DUMMY_REPO_ID}_regression_aggr", root=tmp_path / "regression_aggr") assert_video_timestamps_within_bounds(aggr_ds) + # Depth keys must survive the merge for the regression to cover the + # ``is_depth_map`` decoding branch. + assert set(aggr_ds.meta.depth_keys) == set(datasets[0].meta.depth_keys) + depth_keys = set(aggr_ds.meta.depth_keys) for i in range(len(aggr_ds)): item = aggr_ds[i] for key in aggr_ds.meta.video_keys: assert key in item, f"Video key {key} missing from item {i}" - assert item[key].shape[0] == 3, f"Expected 3 channels for video key {key}" + # Depth frames are single-channel (1, H, W) after dequantization; + # standard RGB frames keep the 3-channel layout. + expected_channels = 1 if key in depth_keys else 3 + assert item[key].shape[0] == expected_channels, ( + f"Expected {expected_channels} channels for video key {key}, got {item[key].shape}" + ) def assert_image_schema_preserved(aggr_ds): @@ -639,25 +698,31 @@ def test_aggregate_image_datasets(tmp_path, lerobot_dataset_factory): ds_0_num_episodes = 2 ds_1_num_episodes = 3 - # Create two image-based datasets (use_videos=False) + # Create two image-based datasets (use_videos=False) with a mix of RGB + # and depth-marked cameras so the depth path is exercised in image mode. 
ds_0 = lerobot_dataset_factory( root=tmp_path / "image_0", repo_id=f"{DUMMY_REPO_ID}_image_0", total_episodes=ds_0_num_episodes, total_frames=ds_0_num_frames, - use_videos=False, # Image-based dataset + use_videos=False, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) ds_1 = lerobot_dataset_factory( root=tmp_path / "image_1", repo_id=f"{DUMMY_REPO_ID}_image_1", total_episodes=ds_1_num_episodes, total_frames=ds_1_num_frames, - use_videos=False, # Image-based dataset + use_videos=False, + camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, ) # Verify source datasets have image keys assert len(ds_0.meta.image_keys) > 0, "ds_0 should have image keys" assert len(ds_1.meta.image_keys) > 0, "ds_1 should have image keys" + # And that the depth marker actually made it onto an image feature. + assert len(ds_0.meta.depth_keys) > 0, "ds_0 should expose at least one depth key" + assert len(ds_1.meta.depth_keys) > 0, "ds_1 should expose at least one depth key" # Aggregate the datasets aggregate_datasets( @@ -692,6 +757,7 @@ def test_aggregate_image_datasets(tmp_path, lerobot_dataset_factory): # Image-specific assertions assert_image_schema_preserved(aggr_ds) assert_image_frames_integrity(aggr_ds, ds_0, ds_1) + assert_depth_keys_preserved(aggr_ds, ds_0, ds_1) # Verify images can be accessed and have correct shape sample_item = aggr_ds[0] diff --git a/tests/datasets/test_compute_stats.py b/tests/datasets/test_compute_stats.py index 0f5abfb95..9f399b85c 100644 --- a/tests/datasets/test_compute_stats.py +++ b/tests/datasets/test_compute_stats.py @@ -35,7 +35,11 @@ from lerobot.utils.constants import OBS_IMAGE, OBS_STATE def mock_load_image_as_numpy(path, dtype, channel_first): - return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype) + is_depth = "depth" in str(path) + channels = 1 if is_depth else 3 + out_dtype = np.uint16 if is_depth else dtype + arr = np.arange(channels * 32 * 32, dtype=out_dtype).reshape(channels, 32, 32) + return arr 
if channel_first else arr.transpose(1, 2, 0) @pytest.fixture @@ -168,22 +172,33 @@ def test_get_feature_stats_single_value(): def test_compute_episode_stats(): + depth_key = "observation.images.depth" episode_data = { OBS_IMAGE: [f"image_{i}.jpg" for i in range(100)], + depth_key: [f"depth_{i}.tiff" for i in range(100)], OBS_STATE: np.random.rand(100, 10), } features = { OBS_IMAGE: {"dtype": "image"}, + depth_key: {"dtype": "image", "info": {"is_depth_map": True}}, OBS_STATE: {"dtype": "numeric"}, } with patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy): stats = compute_episode_stats(episode_data, features) - assert OBS_IMAGE in stats and OBS_STATE in stats + assert OBS_IMAGE in stats and depth_key in stats and OBS_STATE in stats assert stats[OBS_IMAGE]["count"].item() == 100 + assert stats[depth_key]["count"].item() == 100 assert stats[OBS_STATE]["count"].item() == 100 assert stats[OBS_IMAGE]["mean"].shape == (3, 1, 1) + assert stats[depth_key]["mean"].shape == (1, 1, 1) + # Depth keeps raw values: max far exceeds 255, proving no /255 and no uint8 downcast. + assert stats[depth_key]["min"].item() == 0.0 + assert stats[depth_key]["max"].item() == 1023.0 + # RGB is normalized to [0, 1]. 
+ np.testing.assert_allclose(stats[OBS_IMAGE]["min"], 0.0) + np.testing.assert_allclose(stats[OBS_IMAGE]["max"], 1.0) def test_assert_type_and_shape_valid(): @@ -618,25 +633,31 @@ def test_compute_episode_stats_with_custom_quantiles(): def test_compute_episode_stats_with_image_data(): """Test quantile computation with image features.""" image_paths = [f"image_{i}.jpg" for i in range(50)] + depth_paths = [f"depth_{i}.tiff" for i in range(50)] episode_data = { "observation.image": image_paths, + "observation.images.depth": depth_paths, "action": np.random.normal(0, 1, (50, 5)), } features = { "observation.image": {"dtype": "image"}, + "observation.images.depth": {"dtype": "image", "info": {"is_depth_map": True}}, "action": {"dtype": "float32", "shape": (5,)}, } with patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy): stats = compute_episode_stats(episode_data, features) - # Image quantiles should be normalized and have correct shape - assert "q01" in stats["observation.image"] - assert "q50" in stats["observation.image"] - assert "q99" in stats["observation.image"] - assert stats["observation.image"]["q01"].shape == (3, 1, 1) - assert stats["observation.image"]["q50"].shape == (3, 1, 1) - assert stats["observation.image"]["q99"].shape == (3, 1, 1) + # RGB image quantiles should be normalized and per-channel. + for q in ("q01", "q50", "q99"): + assert stats["observation.image"][q].shape == (3, 1, 1) + + # Depth quantiles are single-channel and kept in raw (un-normalized) units. + for q in ("q01", "q50", "q99"): + assert stats["observation.images.depth"][q].shape == (1, 1, 1) + # Depth max stays in raw units (not /255, not uint8-capped); RGB is normalized. 
+ assert stats["observation.images.depth"]["max"].item() == 1023.0 + np.testing.assert_allclose(stats["observation.image"]["max"], 1.0) # Action quantiles should have correct shape assert stats["action"]["q01"].shape == (5,) diff --git a/tests/datasets/test_dataset_metadata.py b/tests/datasets/test_dataset_metadata.py index 171d8af8b..a1630f17d 100644 --- a/tests/datasets/test_dataset_metadata.py +++ b/tests/datasets/test_dataset_metadata.py @@ -59,11 +59,13 @@ def _make_dummy_stats(features: dict) -> dict: stats = {} for key, ft in features.items(): if ft["dtype"] in ("image", "video"): + channels = ft["shape"][-1] + stat_shape = (channels, 1, 1) stats[key] = { - "max": np.ones((3, 1, 1), dtype=np.float32), - "mean": np.full((3, 1, 1), 0.5, dtype=np.float32), - "min": np.zeros((3, 1, 1), dtype=np.float32), - "std": np.full((3, 1, 1), 0.25, dtype=np.float32), + "max": np.ones(stat_shape, dtype=np.float32), + "mean": np.full(stat_shape, 0.5, dtype=np.float32), + "min": np.zeros(stat_shape, dtype=np.float32), + "std": np.full(stat_shape, 0.25, dtype=np.float32), "count": np.array([5]), } elif ft["dtype"] in ("float32", "float64", "int64"): @@ -142,6 +144,45 @@ def test_create_without_videos_has_no_video_path(tmp_path): assert meta.video_keys == [] +@pytest.mark.parametrize( + ("marker_field", "marker_key"), + [ + ("info", "is_depth_map"), + ("info", "video.is_depth_map"), + ("video_info", "video.is_depth_map"), + ], + ids=["info.is_depth_map", "info.video.is_depth_map_legacy", "video_info.video.is_depth_map_legacy"], +) +def test_depth_keys_property_filters_by_marker(tmp_path, marker_field, marker_key): + """``depth_keys`` recognises the canonical and the two legacy marker variants.""" + depth_feature = { + "dtype": "video", + "shape": (64, 96, 1), + "names": ["height", "width", "channels"], + marker_field: {marker_key: True}, + } + features = { + **VIDEO_FEATURES, + "observation.images.laptop_depth": depth_feature, + } + meta = LeRobotDatasetMetadata.create( + 
repo_id="test/depth_keys", + fps=DEFAULT_FPS, + features=features, + root=tmp_path / f"depth_keys_{marker_field}_{marker_key.replace('.', '_')}", + ) + + assert set(meta.video_keys) == {"observation.images.laptop", "observation.images.laptop_depth"} + assert meta.depth_keys == ["observation.images.laptop_depth"] + + +def test_depth_keys_empty_when_no_marker(tmp_path): + meta = LeRobotDatasetMetadata.create( + repo_id="test/no_depth", fps=DEFAULT_FPS, features=VIDEO_FEATURES, root=tmp_path / "no_depth" + ) + assert meta.depth_keys == [] + + def test_create_raises_on_existing_directory(tmp_path): """create() raises if root directory already exists.""" root = tmp_path / "existing" diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py index d36312920..c19e7f41f 100644 --- a/tests/datasets/test_dataset_tools.py +++ b/tests/datasets/test_dataset_tools.py @@ -24,7 +24,7 @@ import torch pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") -from lerobot.configs import VideoEncoderConfig +from lerobot.configs import DepthEncoderConfig, RGBEncoderConfig from lerobot.datasets.dataset_tools import ( add_features, convert_image_to_video_dataset, @@ -37,7 +37,9 @@ from lerobot.datasets.dataset_tools import ( split_dataset, ) from lerobot.datasets.io_utils import load_info -from tests.datasets.test_video_encoding import _add_frames, require_h264, require_libsvtav1 +from tests.datasets.test_video_encoding import require_h264, require_hevc, require_libsvtav1 +from tests.fixtures.constants import DUMMY_DEPTH_FEATURES, DUMMY_DEPTH_KEY +from tests.fixtures.dataset_factories import add_frames @pytest.fixture @@ -1251,7 +1253,7 @@ def test_convert_image_to_video_dataset(tmp_path): dataset=source_dataset, output_dir=output_dir, repo_id="lerobot/pusht_video", - camera_encoder=VideoEncoderConfig( + rgb_encoder=RGBEncoderConfig( vcodec="libsvtav1", pix_fmt="yuv420p", g=2, @@ -1332,9 +1334,131 @@ def 
test_convert_image_to_video_dataset_subset_episodes(tmp_path): shutil.rmtree(output_dir) +@require_libsvtav1 +@require_hevc +def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_factory): + """Depth image features convert to depth videos using the depth encoder. + + Mirrors :func:`test_convert_image_to_video_dataset` but with a small local + image dataset that mixes an RGB camera with a depth camera, so the + ``depth_keys`` → ``depth_encoder`` routing and ``is_depth_map`` preservation + are exercised end-to-end. + """ + features = { + "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, + "observation.images.cam": { + "dtype": "image", + "shape": (64, 96, 3), + "names": ["height", "width", "channels"], + }, + "observation.images.depth": { + "dtype": "image", + "shape": (64, 96, 1), + "names": ["height", "width", "channels"], + "info": {"is_depth_map": True}, + }, + } + source_dataset = empty_lerobot_dataset_factory( + root=tmp_path / "img_ds", + features=features, + use_videos=False, + ) + + add_frames(source_dataset, num_frames=4) + source_dataset.save_episode() + source_dataset.finalize() + + # Source is an image dataset with the depth marker on the depth camera. + assert len(source_dataset.meta.video_keys) == 0 + assert "observation.images.depth" in source_dataset.meta.depth_keys + + output_dir = tmp_path / "video_ds" + with ( + patch("lerobot.datasets.dataset_metadata.get_safe_version") as mock_get_safe_version, + patch("lerobot.datasets.dataset_metadata.snapshot_download") as mock_snapshot_download, + ): + mock_get_safe_version.return_value = "v3.0" + mock_snapshot_download.return_value = str(output_dir) + + # Use non-default quantization params so the persisted metadata must + # come from the depth encoder (not RGB encoder defaults). 
+ depth_encoder = DepthEncoderConfig( + vcodec="hevc", + pix_fmt="gray12le", + g=2, + crf=30, + depth_min=0.05, + depth_max=8.0, + shift=2.0, + use_log=False, + ) + video_dataset = convert_image_to_video_dataset( + dataset=source_dataset, + output_dir=output_dir, + repo_id="dummy/depth_video", + rgb_encoder=RGBEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + depth_encoder=depth_encoder, + num_workers=1, + ) + + # Both cameras are now videos, and the depth marker survived the conversion. + assert "observation.images.cam" in video_dataset.meta.video_keys + assert "observation.images.depth" in video_dataset.meta.video_keys + assert "observation.images.depth" in video_dataset.meta.depth_keys + assert "observation.images.cam" not in video_dataset.meta.depth_keys + + depth_path = video_dataset.root / video_dataset.meta.get_video_file_path(0, "observation.images.depth") + assert depth_path.exists(), f"Depth video file should exist: {depth_path}" + + # The persisted depth-video metadata must carry the depth quantization params + # from the depth encoder (so frames dequantize correctly on read), and the RGB + # camera must not be marked as a depth map. + persisted_info = load_info(video_dataset.root) + depth_info = persisted_info.features["observation.images.depth"]["info"] + assert depth_info["is_depth_map"] is True + assert DepthEncoderConfig.from_video_info(depth_info) == depth_encoder + + cam_info = persisted_info.features["observation.images.cam"]["info"] + assert cam_info.get("is_depth_map") is False + assert "video.codec" in cam_info + + # ─── reencode_dataset ───────────────────────────────────────────────── +@require_hevc +def test_reencode_dataset_depth_uses_depth_encoder(tmp_path, empty_lerobot_dataset_factory): + """Depth videos are re-encoded with the depth encoder and keep their depth metadata. + + Depth-focused companion to :func:`test_reencode_dataset_multi_key_multiprocessing`. 
+ """ + initial_cfg = DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=2, crf=30) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", + features=DUMMY_DEPTH_FEATURES, + use_videos=True, + depth_encoder=initial_cfg, + ) + + add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + assert DUMMY_DEPTH_KEY in dataset.meta.depth_keys + + target_cfg = DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=6, crf=23) + result = reencode_dataset(dataset, depth_encoder=target_cfg, num_workers=0) + + assert result is dataset + + persisted_info = load_info(dataset.root) + depth_info = persisted_info.features[DUMMY_DEPTH_KEY].get("info", {}) + # Re-encode applied the new codec parameters to the depth video ... + assert DepthEncoderConfig.from_video_info(depth_info) == target_cfg + # ... while preserving the depth marker. + assert depth_info["is_depth_map"] is True + + @require_libsvtav1 @require_h264 def test_reencode_dataset_multi_key_multiprocessing( @@ -1342,29 +1466,29 @@ def test_reencode_dataset_multi_key_multiprocessing( ): """Re-encode a two-camera dataset with num_workers=2 and verify metadata refresh.""" features = features_factory(use_videos=True) - initial_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + initial_cfg = RGBEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) dataset = empty_lerobot_dataset_factory( root=tmp_path / "ds", features=features, use_videos=True, - camera_encoder=initial_cfg, + rgb_encoder=initial_cfg, ) - _add_frames(dataset, num_frames=4) + add_frames(dataset, num_frames=4) dataset.save_episode() - _add_frames(dataset, num_frames=4) + add_frames(dataset, num_frames=4) dataset.save_episode() dataset.finalize() assert len(dataset.meta.video_keys) == 2 - target_cfg = VideoEncoderConfig(vcodec="h264", g=6, crf=23, pix_fmt="yuv420p") + target_cfg = RGBEncoderConfig(vcodec="h264", g=6, crf=23, pix_fmt="yuv420p") - result = reencode_dataset(dataset, 
camera_encoder=target_cfg, num_workers=2) + result = reencode_dataset(dataset, rgb_encoder=target_cfg, num_workers=2) assert result is dataset persisted_info = load_info(dataset.root) for vk in dataset.meta.video_keys: - persisted_encoder = VideoEncoderConfig.from_video_info(persisted_info.features[vk].get("info", {})) + persisted_encoder = RGBEncoderConfig.from_video_info(persisted_info.features[vk].get("info", {})) assert persisted_encoder == target_cfg diff --git a/tests/datasets/test_dataset_writer.py b/tests/datasets/test_dataset_writer.py index 8670aeebc..17785ad74 100644 --- a/tests/datasets/test_dataset_writer.py +++ b/tests/datasets/test_dataset_writer.py @@ -53,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict: # ── Existing encode_video_worker tests ─────────────────────────────── -def test_encode_video_worker_forwards_camera_encoder(tmp_path): - """_encode_video_worker forwards camera_encoder to encode_video_frames.""" +def test_encode_video_worker_forwards_video_encoder(tmp_path): + """_encode_video_worker forwards video_encoder to encode_video_frames.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -74,16 +74,16 @@ def test_encode_video_worker_forwards_camera_encoder(tmp_path): 0, tmp_path, fps=30, - camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None), + video_encoder=VideoEncoderConfig(vcodec="h264", preset=None), encoder_threads=4, ) - assert captured_kwargs["camera_encoder"].vcodec == "h264" + assert captured_kwargs["video_encoder"].vcodec == "h264" assert captured_kwargs["encoder_threads"] == 4 -def test_encode_video_worker_default_camera_encoder(tmp_path): - """_encode_video_worker passes None camera_encoder which encode_video_frames defaults.""" +def test_encode_video_worker_default_video_encoder(tmp_path): + """_encode_video_worker passes None video_encoder which encode_video_frames 
defaults.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -100,7 +100,7 @@ def test_encode_video_worker_default_camera_encoder(tmp_path): with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): _encode_video_worker(video_key, 0, tmp_path, fps=30) - assert captured_kwargs["camera_encoder"] is None + assert captured_kwargs["video_encoder"] is None assert captured_kwargs["encoder_threads"] is None diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 1d2fb1d55..225479814 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -1534,6 +1534,10 @@ def test_valid_video_codecs_constant(): assert "auto" in VALID_VIDEO_CODECS assert "h264_videotoolbox" in VALID_VIDEO_CODECS assert "h264_nvenc" in VALID_VIDEO_CODECS + assert "h264_vaapi" in VALID_VIDEO_CODECS + assert "h264_qsv" in VALID_VIDEO_CODECS + assert "hevc_videotoolbox" in VALID_VIDEO_CODECS + assert "hevc_nvenc" in VALID_VIDEO_CODECS assert len(VALID_VIDEO_CODECS) == 10 diff --git a/tests/datasets/test_depth.py b/tests/datasets/test_depth.py new file mode 100644 index 000000000..a075fa6b5 --- /dev/null +++ b/tests/datasets/test_depth.py @@ -0,0 +1,247 @@ +"""Tests for the depth-integration feature. + +Covers: +- ``depth_utils`` quantize/dequantize round-trips and backend agreement. +- Image-writer support for single-channel depth. +- Hardware-feature → depth flag routing. +- Feature-to-file-format routing through the dataset writer. + +Depth metadata detection on ``LeRobotDatasetMetadata.depth_keys`` lives in +``test_dataset_metadata.py``. Depth video encoding/decoding lives in +``test_video_encoding.py``. 
+""" + +from pathlib import Path + +import pytest + +pytest.importorskip("av", reason="av is required (install lerobot[dataset])") + +import av +import numpy as np +import PIL.Image +import torch + +from lerobot.configs import DepthEncoderConfig +from lerobot.configs.video import ( + DEFAULT_DEPTH_MAX, + DEFAULT_DEPTH_MIN, + DEPTH_METER_UNIT, + DEPTH_MILLIMETER_UNIT, + DEPTH_QMAX, +) +from lerobot.datasets.depth_utils import dequantize_depth, quantize_depth +from lerobot.datasets.image_writer import image_array_to_pil_image, write_image +from tests.fixtures.constants import ( + DEFAULT_FPS, + DUMMY_CAMERA_FEATURES, + DUMMY_CAMERA_FEATURES_WITH_DEPTH, + DUMMY_CHW, + DUMMY_DEPTH_CAMERA_FEATURES, + DUMMY_REPO_ID, +) +from tests.fixtures.dataset_factories import add_frames + +_, H, W = DUMMY_CHW + + +def _depth_metres_ramp() -> np.ndarray: + """Linearly-spaced float32 depth in metres covering the default range.""" + return np.linspace(DEFAULT_DEPTH_MIN, DEFAULT_DEPTH_MAX, H * W, dtype=np.float32).reshape(H, W) + + +# ── 1. 
Quantize / dequantize round-trips ────────────────────────────── + + +class TestQuantizeDequantize: + """Numerical contract of ``quantize_depth`` / ``dequantize_depth``.""" + + @pytest.mark.parametrize("use_log", [False, True]) + @pytest.mark.parametrize("output_unit", [DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]) + @pytest.mark.parametrize("output_channel_last", [False, True]) + def test_roundtrip(self, use_log, output_unit, output_channel_last): + """quantize → dequantize recovers depth; layout and unit are honored.""" + depth = _depth_metres_ramp() + quantized = quantize_depth(depth, use_log=use_log, video_backend=None) + recovered = dequantize_depth( + quantized, + use_log=use_log, + output_unit=output_unit, + output_tensor=False, + output_channel_last=output_channel_last, + ) + + expected_shape = (H, W, 1) if output_channel_last else (1, H, W) + assert recovered.shape == expected_shape + + recovered_m = recovered.astype(np.float32) + if output_unit == DEPTH_MILLIMETER_UNIT: + recovered_m = recovered_m / 1000.0 + recovered_2d = recovered_m[..., 0] if output_channel_last else recovered_m[0] + + if use_log: + # Log mode: tighter near-range error than far-range (the whole point). + near = depth < 1.0 + far = depth > 8.0 + err_near = np.abs(recovered_2d[near] - depth[near]) + err_far = np.abs(recovered_2d[far] - depth[far]) + assert err_near.mean() < err_far.mean() + else: + # Linear mode: bounded by quant step + 1 mm of unit-conversion rounding. 
+ tol = (DEFAULT_DEPTH_MAX - DEFAULT_DEPTH_MIN) / DEPTH_QMAX + 1e-3 + np.testing.assert_allclose(recovered_2d, depth, atol=tol) + + @pytest.mark.parametrize("use_log", [False, True]) + @pytest.mark.parametrize("output_unit", [DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]) + def test_numpy_torch_agree(self, use_log, output_unit): + """Batched torch path produces the same values as the numpy path.""" + batch_size = 3 + per_frame = np.linspace(0, DEPTH_QMAX, H * W, dtype=np.uint16).reshape(H, W) + batch_np = np.broadcast_to(per_frame[None, None, ...], (batch_size, 1, H, W)).copy() + batch_t = torch.from_numpy(batch_np.astype(np.int32)) # torch.uint16 support is patchy. + + ref = dequantize_depth(batch_np, use_log=use_log, output_unit=output_unit, output_tensor=False) + out = dequantize_depth(batch_t, use_log=use_log, output_unit=output_unit, output_tensor=True) + + assert isinstance(out, torch.Tensor) + assert out.shape == (batch_size, 1, H, W) + # ``m``: float32 noise (~10 µm in log mode, after ``exp``) — still 200× below the ~2 mm quant step. + # ``mm`` + tensor stays in float32 (no uint16 round-trip), so allow 1 mm slop. 
+ atol = 1e-5 if output_unit == DEPTH_METER_UNIT else 1.0 + np.testing.assert_allclose(out.cpu().numpy().astype(np.float64), ref.astype(np.float64), atol=atol) + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((H, W), (1, H, W)), + ((1, H, W), (1, H, W)), + ((H, W, 1), (1, H, W)), + ((3, 1, H, W), (3, 1, H, W)), + ((3, H, W, 1), (3, 1, H, W)), + ], + ) + def test_input_layouts_accepted(self, input_shape, output_shape): + """All documented input layouts decode to the channel-first default.""" + quantized = np.full(input_shape, DEPTH_QMAX // 2, dtype=np.uint16) + out = dequantize_depth(quantized, output_unit=DEPTH_METER_UNIT, output_tensor=False) + assert out.shape == output_shape + + def test_pyav_frame_roundtrip(self): + """quantize → av.VideoFrame → dequantize works.""" + depth = _depth_metres_ramp() + frame = quantize_depth(depth, use_log=False, video_backend="pyav") + assert isinstance(frame, av.VideoFrame) + + recovered = dequantize_depth(frame, use_log=False, output_unit=DEPTH_METER_UNIT, output_tensor=False) + assert recovered.shape == (1, H, W) + tol = (DEFAULT_DEPTH_MAX - DEFAULT_DEPTH_MIN) / DEPTH_QMAX + 1e-3 + np.testing.assert_allclose(recovered[0], depth, atol=tol) + + def test_invalid_log_params_raises(self): + with pytest.raises(ValueError, match=r"depth_min \+ shift must be positive"): + quantize_depth(_depth_metres_ramp(), depth_min=1.0, shift=-2.0, use_log=True, video_backend=None) + + +# ── 2. 
Image writer depth support ───────────────────────────────────── + + +class TestImageWriterDepth: + """``image_array_to_pil_image`` and ``write_image`` for depth maps.""" + + @pytest.mark.parametrize("dtype,expected_mode", [(np.uint16, "I;16"), (np.float32, "F")]) + @pytest.mark.parametrize("shape", [(H, W), (H, W, 1), (1, H, W)]) + def test_pil_depth_modes_and_squeeze(self, dtype, expected_mode, shape): + """Single-channel depth converts to PIL with the right mode and (W, H) size.""" + arr = np.zeros(shape, dtype=dtype) + img = image_array_to_pil_image(arr) + assert img.mode == expected_mode + assert img.size == (W, H) + + def test_write_image_tiff_roundtrip(self, tmp_path): + """uint16 depth round-trips through .tiff.""" + arr = np.arange(H * W, dtype=np.uint16).reshape(H, W) + fpath = tmp_path / "depth.tiff" + write_image(arr, fpath) + with PIL.Image.open(fpath) as loaded: + recovered = np.array(loaded) + np.testing.assert_array_equal(recovered, arr) + + +# ── 3. Hardware-feature → depth flag ────────────────────────────────── + + +class TestHwToDatasetFeaturesDepth: + """``hw_to_dataset_features`` flags single-channel cameras as depth.""" + + @pytest.mark.parametrize("channels,is_depth", [(1, True), (3, False)]) + def test_depth_marker_by_channels(self, channels, is_depth): + from lerobot.utils.feature_utils import hw_to_dataset_features + + features = hw_to_dataset_features({"cam": (480, 640, channels)}, prefix="observation") + assert features["observation.images.cam"]["info"]["is_depth_map"] is is_depth + + def test_invalid_channel_count_raises(self): + from lerobot.utils.feature_utils import hw_to_dataset_features + + with pytest.raises(ValueError, match="Expected a 3-tuple"): + hw_to_dataset_features({"cam": (480, 640, 2)}, prefix="observation") + + +# ── 4. Feature-to-file-format routing ──────────────────────────────── + + +# Keys derived from DUMMY_CAMERA_FEATURES_WITH_DEPTH; pick one RGB and the depth camera. 
+RGB_KEY = next(iter(DUMMY_CAMERA_FEATURES)) +DEPTH_KEY = next(iter(DUMMY_DEPTH_CAMERA_FEATURES)) + + +class TestFeatureFileRouting: + """Depth vs RGB features route to the correct file format.""" + + NUM_FRAMES = 5 + + def test_image_mode_depth_tiff_rgb_png(self, tmp_path, features_factory): + """Without video encoding: depth → .tiff, RGB → .png.""" + from lerobot.datasets.lerobot_dataset import LeRobotDataset + + features = features_factory(camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=False) + dataset = LeRobotDataset.create( + repo_id=DUMMY_REPO_ID, + fps=DEFAULT_FPS, + features=features, + root=tmp_path / "ds", + use_videos=False, + ) + + add_frames(dataset, num_frames=self.NUM_FRAMES) + + buf = dataset.writer.episode_buffer + assert all(Path(p).suffix == ".tiff" for p in buf[DEPTH_KEY]) + assert all(Path(p).suffix == ".png" for p in buf[RGB_KEY]) + + dataset.save_episode() + dataset.finalize() + + def test_video_mode_depth_uses_depth_encoder(self, tmp_path, features_factory): + """With streaming video encoding: depth → DepthEncoderConfig, RGB does not.""" + from lerobot.datasets.lerobot_dataset import LeRobotDataset + + features = features_factory(camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=True) + dataset = LeRobotDataset.create( + repo_id=DUMMY_REPO_ID, + fps=DEFAULT_FPS, + features=features, + root=tmp_path / "ds", + use_videos=True, + streaming_encoding=True, + ) + + add_frames(dataset, num_frames=self.NUM_FRAMES) + + encoder = dataset.writer._streaming_encoder + assert encoder is not None + assert isinstance(encoder._threads[DEPTH_KEY].video_encoder, DepthEncoderConfig) + assert not isinstance(encoder._threads[RGB_KEY].video_encoder, DepthEncoderConfig) + + dataset.save_episode() + dataset.finalize() diff --git a/tests/datasets/test_image_writer.py b/tests/datasets/test_image_writer.py index 916b8f017..1cf2cf75c 100644 --- a/tests/datasets/test_image_writer.py +++ b/tests/datasets/test_image_writer.py @@ -94,7 +94,7 @@ def 
test_image_array_to_pil_image_pytorch_format(img_array_factory): def test_image_array_to_pil_image_single_channel(img_array_factory): img_array = img_array_factory(channels=1) - with pytest.raises(NotImplementedError): + with pytest.raises(ValueError, match="Unsupported single-channel image dtype"): image_array_to_pil_image(img_array) @@ -344,7 +344,7 @@ def test_with_different_image_formats(tmp_path, img_array_factory): writer = AsyncImageWriter() try: image_array = img_array_factory() - formats = ["png", "jpeg", "bmp"] + formats = ["png", "tiff", "tif"] for fmt in formats: fpath = tmp_path / f"test_image.{fmt}" write_image(image_array, fpath) diff --git a/tests/datasets/test_streaming_video_encoder.py b/tests/datasets/test_streaming_video_encoder.py index b69f24254..1ffad6854 100644 --- a/tests/datasets/test_streaming_video_encoder.py +++ b/tests/datasets/test_streaming_video_encoder.py @@ -26,7 +26,7 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])") import av # noqa: E402 -from lerobot.configs import VideoEncoderConfig +from lerobot.configs import RGBEncoderConfig from lerobot.datasets.pyav_utils import get_codec from lerobot.datasets.video_utils import ( StreamingVideoEncoder, @@ -57,13 +57,11 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() - enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) + enc_cfg = RGBEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec=enc_cfg.vcodec, - pix_fmt=enc_cfg.pix_fmt, - codec_options=enc_cfg.get_codec_options(as_strings=True), + video_encoder=enc_cfg, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -108,13 +106,11 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() - enc_cfg = 
VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) + enc_cfg = RGBEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec=enc_cfg.vcodec, - pix_fmt=enc_cfg.pix_fmt, - codec_options=enc_cfg.get_codec_options(as_strings=True), + video_encoder=enc_cfg, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -142,13 +138,11 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() - enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) + enc_cfg = RGBEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec=enc_cfg.vcodec, - pix_fmt=enc_cfg.pix_fmt, - codec_options=enc_cfg.get_codec_options(as_strings=True), + video_encoder=enc_cfg, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -171,15 +165,15 @@ class TestCameraEncoderThread: class TestStreamingVideoEncoder: def _make_encoder_config(self, **kwargs): - """Helper to build a VideoEncoderConfig.""" - return VideoEncoderConfig(**kwargs) + """Helper to build an RGBEncoderConfig.""" + return RGBEncoderConfig(**kwargs) def test_single_camera_episode(self, tmp_path): """Test encoding a single camera episode.""" video_keys = [f"{OBS_IMAGES}.laptop"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config( + rgb_encoder=self._make_encoder_config( vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 ), ) @@ -211,7 +205,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + rgb_encoder=self._make_encoder_config(vcodec="libsvtav1", 
pix_fmt="yuv420p", g=2, crf=30), ) encoder.start_episode(video_keys, tmp_path) @@ -237,7 +231,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.cam"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + rgb_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), ) for ep in range(3): @@ -263,7 +257,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.cam"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + rgb_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), ) encoder.start_episode(video_keys, tmp_path) @@ -309,7 +303,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.cam"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config( + rgb_encoder=self._make_encoder_config( vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 ), ) @@ -346,7 +340,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + rgb_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), ) encoder.start_episode(video_keys, tmp_path) @@ -375,7 +369,7 @@ class TestStreamingVideoEncoder: def test_encoder_threads_passed_to_thread(self, tmp_path): """Test that encoder_threads is stored and passed through to encoder threads.""" video_keys = [f"{OBS_IMAGES}.cam"] - cfg = VideoEncoderConfig( + cfg = RGBEncoderConfig( vcodec="libsvtav1", pix_fmt="yuv420p", g=2, @@ -383,7 +377,7 @@ class TestStreamingVideoEncoder: ) encoder = StreamingVideoEncoder( fps=30, - camera_encoder=cfg, + rgb_encoder=cfg, encoder_threads=2, ) assert encoder._encoder_threads == 2 @@ -391,7 +385,8 @@ class 
TestStreamingVideoEncoder: # Verify codec options include thread tuning for libsvtav1 (lp=…) thread = encoder._threads[f"{OBS_IMAGES}.cam"] - assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options + codec_opts = thread.video_encoder.get_codec_options(encoder_threads=thread.encoder_threads) + assert "svtav1-params" in codec_opts or "threads" in codec_opts # Feed some frames and finish to ensure it works end-to-end num_frames = 10 @@ -422,7 +417,7 @@ class TestStreamingVideoEncoder: video_keys = [f"{OBS_IMAGES}.cam"] encoder = StreamingVideoEncoder( fps=30, - camera_encoder=self._make_encoder_config( + rgb_encoder=self._make_encoder_config( vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 ), queue_maxsize=1, diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py index 2a35f3210..80819d665 100644 --- a/tests/datasets/test_video_encoding.py +++ b/tests/datasets/test_video_encoding.py @@ -26,7 +26,7 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])") import av # noqa: E402 -from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig +from lerobot.configs import VALID_VIDEO_CODECS, DepthEncoderConfig, RGBEncoderConfig, VideoEncoderConfig from lerobot.datasets.image_writer import write_image from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.pyav_utils import get_codec @@ -37,7 +37,15 @@ from lerobot.datasets.video_utils import ( get_video_info, reencode_video, ) -from tests.fixtures.constants import DUMMY_VIDEO_INFO +from tests.fixtures.constants import ( + DUMMY_DEPTH_FEATURES, + DUMMY_DEPTH_KEY, + DUMMY_DEPTH_VIDEO_INFO_FULL, + DUMMY_VIDEO_FEATURES, + DUMMY_VIDEO_INFO, + DUMMY_VIDEO_KEY, +) +from tests.fixtures.dataset_factories import add_frames # Per-codec skip markers — validation tests only fire when the codec is available @@ -48,19 +56,74 @@ def _require_encoder(vcodec: str) -> pytest.MarkDecorator: require_libsvtav1 = 
_require_encoder("libsvtav1") require_h264 = _require_encoder("h264") +require_hevc = _require_encoder("hevc") require_videotoolbox = _require_encoder("h264_videotoolbox") require_nvenc = _require_encoder("h264_nvenc") require_vaapi = _require_encoder("h264_vaapi") require_qsv = _require_encoder("h264_qsv") -# ─── VideoEncoderConfig / codec options ────────────────────────────── +TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos" + + +def _write_color_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None: + imgs_dir.mkdir(parents=True, exist_ok=True) + for i in range(num_frames): + arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + write_image(arr, imgs_dir / f"frame-{i:06d}.png") + + +def _write_depth_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None: + """Write synthetic uint16 depth TIFFs (millimetres) for depth encoder tests. + + Uses a smooth linear ramp + per-frame offset (not white noise) so HEVC Main 12 + on ``gray12le`` compresses well. Values span ~100 mm to 10 m, covering most + of the default ``[DEPTH_MIN, DEPTH_MAX]`` metres range after + ``quantize_depth(input_unit="auto"="mm")``. + """ + imgs_dir.mkdir(parents=True, exist_ok=True) + base = np.linspace(100.0, 10_000.0, height * width, dtype=np.float32).reshape(height, width) + for i in range(num_frames): + arr = (base + 50.0 * i).clip(0, 65535).astype(np.uint16) + write_image(arr, imgs_dir / f"frame-{i:06d}.tiff") + + +def _encode_video( + path: Path, + num_frames: int = 4, + fps: int = 30, + cfg: VideoEncoderConfig | None = None, + depth: bool = False, +) -> Path: + """Write synthetic frames to a temp dir and encode them to ``path``. + + ``depth=False`` writes uint8 RGB PNG noise and encodes with ``cfg`` + (defaulting to the library default). 
``depth=True`` writes synthetic uint16 + depth TIFFs and encodes with ``cfg`` or a default :class:`DepthEncoderConfig` + (HEVC Main 12 / ``gray12le``). + """ + imgs_dir = path.parent / f"imgs_{path.stem}" + if depth: + _write_depth_frames(imgs_dir, num_frames=num_frames) + cfg = cfg or DepthEncoderConfig() + else: + _write_color_frames(imgs_dir, num_frames=num_frames) + encode_video_frames(imgs_dir, path, fps=fps, video_encoder=cfg, overwrite=True) + return path + + +def _read_feature_info(dataset: LeRobotDataset, key: str = DUMMY_VIDEO_KEY) -> dict: + info = json.loads((dataset.root / INFO_PATH).read_text()) + return info["features"][key]["info"] + + +# ─── RGBEncoderConfig / codec options ────────────────────────────── class TestCodecOptions: @require_libsvtav1 def test_libsvtav1_defaults(self): - cfg = VideoEncoderConfig() + cfg = RGBEncoderConfig() opts = cfg.get_codec_options() assert opts["g"] == 2 assert opts["crf"] == 30 @@ -68,12 +131,12 @@ class TestCodecOptions: @require_libsvtav1 def test_libsvtav1_custom_preset(self): - cfg = VideoEncoderConfig(preset=8) + cfg = RGBEncoderConfig(preset=8) assert cfg.get_codec_options()["preset"] == 8 @require_h264 def test_h264_options(self): - cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None) + cfg = RGBEncoderConfig(vcodec="h264", g=10, crf=23, preset=None) opts = cfg.get_codec_options() assert opts["g"] == 10 assert opts["crf"] == 23 @@ -81,120 +144,120 @@ class TestCodecOptions: @require_videotoolbox def test_videotoolbox_options(self): - cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None) + cfg = RGBEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None) opts = cfg.get_codec_options() assert opts["g"] == 2 assert opts["q:v"] == 40 assert "crf" not in opts - @_require_encoder("h264_nvenc") + @require_nvenc def test_nvenc_options(self): - cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None) + cfg = RGBEncoderConfig(vcodec="h264_nvenc", g=2, 
crf=25, preset=None) opts = cfg.get_codec_options() assert opts["rc"] == 0 assert opts["qp"] == 25 assert "crf" not in opts assert opts["g"] == 2 - @_require_encoder("h264_vaapi") + @require_vaapi def test_vaapi_options(self): - cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None) + cfg = RGBEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None) assert cfg.get_codec_options()["qp"] == 28 - @_require_encoder("h264_qsv") + @require_qsv def test_qsv_options(self): - cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None) + cfg = RGBEncoderConfig(vcodec="h264_qsv", crf=25, preset=None) assert cfg.get_codec_options()["global_quality"] == 25 @require_h264 def test_no_g_no_crf(self): - cfg = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None) + cfg = RGBEncoderConfig(vcodec="h264", g=None, crf=None, preset=None) opts = cfg.get_codec_options() assert "g" not in opts assert "crf" not in opts @require_libsvtav1 def test_encoder_threads_libsvtav1(self): - cfg = VideoEncoderConfig(fast_decode=0) + cfg = RGBEncoderConfig(fast_decode=0) opts = cfg.get_codec_options(encoder_threads=4) assert "lp=4" in opts.get("svtav1-params", "") @require_h264 def test_encoder_threads_h264(self): - cfg = VideoEncoderConfig(vcodec="h264", preset=None) + cfg = RGBEncoderConfig(vcodec="h264", preset=None) assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2 @require_libsvtav1 def test_fast_decode_libsvtav1(self): - cfg = VideoEncoderConfig(fast_decode=1) + cfg = RGBEncoderConfig(fast_decode=1) opts = cfg.get_codec_options() assert "fast-decode=1" in opts.get("svtav1-params", "") @require_libsvtav1 def test_libsvtav1_fast_decode_clamped_to_svt_range(self): """Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode).""" - cfg = VideoEncoderConfig(fast_decode=100) + cfg = RGBEncoderConfig(fast_decode=100) assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "") - cfg_neg = VideoEncoderConfig(fast_decode=-5) + 
cfg_neg = RGBEncoderConfig(fast_decode=-5) assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "") @require_h264 def test_fast_decode_h264(self): - cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None) + cfg = RGBEncoderConfig(vcodec="h264", fast_decode=1, preset=None) assert cfg.get_codec_options()["tune"] == "fastdecode" @require_libsvtav1 def test_pix_fmt_unsupported_raises(self): """Passing an unsupported pix_fmt is a hard error.""" with pytest.raises(ValueError, match="pix_fmt"): - VideoEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants + RGBEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants @require_libsvtav1 @require_h264 def test_preset_default_behaviour(self): """Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None.""" - assert VideoEncoderConfig().preset == 12 - assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12 - assert VideoEncoderConfig(vcodec="h264").preset is None - assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None + assert RGBEncoderConfig().preset == 12 + assert RGBEncoderConfig(vcodec="libsvtav1").preset == 12 + assert RGBEncoderConfig(vcodec="h264").preset is None + assert RGBEncoderConfig(vcodec="h264", preset=None).preset is None @require_h264 def test_preset_string_on_h264(self): """h264 accepts string presets and forwards them to FFmpeg.""" - cfg = VideoEncoderConfig(vcodec="h264", preset="slow") + cfg = RGBEncoderConfig(vcodec="h264", preset="slow") assert cfg.get_codec_options()["preset"] == "slow" @require_videotoolbox def test_preset_on_videotoolbox_not_set(self): """videotoolbox has no preset option at all.""" - cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow") + cfg = RGBEncoderConfig(vcodec="h264_videotoolbox", preset="slow") assert "preset" not in cfg.get_codec_options() @require_libsvtav1 def test_libsvtav1_preset_out_of_range_raises(self): """libsvtav1 preset must sit in [-2, 13] 
as exposed by PyAV.""" with pytest.raises(ValueError, match="out of range"): - VideoEncoderConfig(vcodec="libsvtav1", preset=100) + RGBEncoderConfig(vcodec="libsvtav1", preset=100) with pytest.raises(ValueError, match="out of range"): - VideoEncoderConfig(vcodec="libsvtav1", preset=-3) + RGBEncoderConfig(vcodec="libsvtav1", preset=-3) @require_libsvtav1 def test_libsvtav1_crf_out_of_range_raises(self): """libsvtav1 crf must sit in [0, 63].""" with pytest.raises(ValueError, match="crf.*out of range"): - VideoEncoderConfig(vcodec="libsvtav1", crf=64) + RGBEncoderConfig(vcodec="libsvtav1", crf=64) @require_libsvtav1 def test_libsvtav1_crf_rejects_python_float(self): """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation.""" with pytest.raises(ValueError, match="float values are not allowed"): - VideoEncoderConfig(vcodec="libsvtav1", crf=2.5) + RGBEncoderConfig(vcodec="libsvtav1", crf=2.5) @require_libsvtav1 def test_libsvtav1_extra_crf_rejects_fractional_string(self): """INT options reject fractional values even when supplied only via ``extra_options``.""" with pytest.raises(ValueError, match="float values are not allowed"): - VideoEncoderConfig( + RGBEncoderConfig( vcodec="libsvtav1", crf=None, extra_options={"crf": "2.5"}, @@ -203,7 +266,7 @@ class TestCodecOptions: @require_libsvtav1 def test_libsvtav1_extra_crf_rejects_float(self): with pytest.raises(ValueError, match="float values are not allowed"): - VideoEncoderConfig( + RGBEncoderConfig( vcodec="libsvtav1", crf=None, extra_options={"crf": 2.5}, @@ -212,13 +275,13 @@ class TestCodecOptions: @require_h264 def test_h264_crf_accepts_float_and_int(self): """x264 exposes crf as a FLOAT option, so both int and float are accepted.""" - assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23 - assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5 + assert RGBEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23 + 
assert RGBEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5 @require_libsvtav1 def test_validate_is_rerunnable(self): """After mutating a field, validate() re-checks and surfaces new issues.""" - cfg = VideoEncoderConfig(vcodec="libsvtav1") + cfg = RGBEncoderConfig(vcodec="libsvtav1") cfg.preset = 100 # now out of range with pytest.raises(ValueError, match="out of range"): cfg.validate() @@ -227,58 +290,58 @@ class TestCodecOptions: class TestExtraOptions: @require_libsvtav1 def test_default_is_empty_dict(self): - cfg = VideoEncoderConfig() + cfg = RGBEncoderConfig() assert cfg.extra_options == {} @require_libsvtav1 def test_unknown_key_passes_through(self): """Keys not published as AVOptions are forwarded to FFmpeg.""" - cfg = VideoEncoderConfig(extra_options={"totally_made_up_option": "value"}) + cfg = RGBEncoderConfig(extra_options={"totally_made_up_option": "value"}) assert cfg.extra_options == {"totally_made_up_option": "value"} @require_libsvtav1 def test_numeric_value_in_range_ok(self): """libsvtav1 exposes ``qp`` as INT in [0, 63].""" - cfg = VideoEncoderConfig(extra_options={"qp": 30}) + cfg = RGBEncoderConfig(extra_options={"qp": 30}) assert cfg.extra_options == {"qp": 30} @require_libsvtav1 def test_numeric_out_of_range_raises(self): with pytest.raises(ValueError, match=r"qp=.*out of range"): - VideoEncoderConfig(extra_options={"qp": 999}) + RGBEncoderConfig(extra_options={"qp": 999}) @require_libsvtav1 def test_numeric_string_accepted_in_range(self): """Numeric strings are accepted for numeric options (mirrors FFmpeg).""" - cfg = VideoEncoderConfig(extra_options={"qp": "18"}) + cfg = RGBEncoderConfig(extra_options={"qp": "18"}) assert cfg.extra_options == {"qp": "18"} @require_libsvtav1 def test_numeric_string_out_of_range_raises(self): with pytest.raises(ValueError, match=r"qp=.*out of range"): - VideoEncoderConfig(extra_options={"qp": "999"}) + RGBEncoderConfig(extra_options={"qp": "999"}) @require_libsvtav1 def 
test_non_numeric_string_on_numeric_option_raises(self): with pytest.raises(ValueError, match=r"qp=.*not numeric"): - VideoEncoderConfig(extra_options={"qp": "medium"}) + RGBEncoderConfig(extra_options={"qp": "medium"}) @require_libsvtav1 def test_bool_on_numeric_option_raises(self): """``bool`` is explicitly rejected for numeric options.""" with pytest.raises(ValueError, match=r"qp=.*not numeric"): - VideoEncoderConfig(extra_options={"qp": True}) + RGBEncoderConfig(extra_options={"qp": True}) @require_h264 def test_string_option_passes_through_unchecked(self): """String-typed AVOptions are NOT enum-checked (too many accept freeform).""" - cfg = VideoEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"}) + cfg = RGBEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"}) assert cfg.extra_options == {"tune": "some-future-tune"} @require_libsvtav1 def test_merged_into_codec_options_and_stringified(self): """Typed merge by default; ``as_strings=True`` matches FFmpeg option dict.""" - cfg = VideoEncoderConfig(extra_options={"qp": 20}) + cfg = RGBEncoderConfig(extra_options={"qp": 20}) opts = cfg.get_codec_options() assert opts["qp"] == 20 assert isinstance(opts["qp"], int) @@ -287,25 +350,25 @@ class TestExtraOptions: @require_libsvtav1 def test_structured_fields_win_on_collision(self): """A colliding extra_options key is discarded; the structured field wins.""" - cfg = VideoEncoderConfig(crf=30, extra_options={"crf": 18}) + cfg = RGBEncoderConfig(crf=30, extra_options={"crf": 18}) assert cfg.get_codec_options()["crf"] == 30 class TestEncoderDetection: @require_h264 def test_explicit_codec_kept_when_available(self): - cfg = VideoEncoderConfig(vcodec="h264") + cfg = RGBEncoderConfig(vcodec="h264") assert cfg.vcodec == "h264" @require_videotoolbox def test_auto_picks_videotoolbox_when_available(self): """``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present.""" - cfg = 
VideoEncoderConfig(vcodec="auto") + cfg = RGBEncoderConfig(vcodec="auto") assert cfg.vcodec == "h264_videotoolbox" def test_invalid_codec_raises(self): with pytest.raises(ValueError, match="Invalid vcodec"): - VideoEncoderConfig(vcodec="not_a_real_codec") + RGBEncoderConfig(vcodec="not_a_real_codec") def test_hw_encoder_names_listed_as_valid(self): assert "auto" in VALID_VIDEO_CODECS @@ -313,59 +376,6 @@ class TestEncoderDetection: assert "h264_nvenc" in VALID_VIDEO_CODECS -TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos" - -# Default video feature set used by persistence tests. -VIDEO_FEATURES = { - "observation.images.cam": { - "dtype": "video", - "shape": (64, 96, 3), - "names": ["height", "width", "channels"], - }, - "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, -} -VIDEO_KEY = "observation.images.cam" - - -def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None: - imgs_dir.mkdir(parents=True, exist_ok=True) - for i in range(num_frames): - arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) - write_image(arr, imgs_dir / f"frame-{i:06d}.png") - - -def _encode_video( - path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None -) -> Path: - imgs_dir = path.parent / f"imgs_{path.stem}" - _write_frames(imgs_dir, num_frames=num_frames) - encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True) - return path - - -def _read_feature_info(dataset: LeRobotDataset) -> dict: - info = json.loads((dataset.root / INFO_PATH).read_text()) - return info["features"][VIDEO_KEY]["info"] - - -def _add_frames(dataset: LeRobotDataset, num_frames: int, video_keys: list[str] | None = None) -> None: - from lerobot.utils.constants import DEFAULT_FEATURES - - if video_keys is None: - video_keys = dataset.meta.video_keys - for _ in range(num_frames): - frame: dict = {"task": "test"} - for key, ft in 
dataset.meta.features.items(): - if key in DEFAULT_FEATURES: - continue - shape = ft["shape"] - if key in video_keys: - frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8) - else: - frame[key] = np.zeros(shape, dtype=np.float32) - dataset.add_frame(frame) - - class TestGetVideoInfo: def test_returns_all_stream_fields(self): info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4") @@ -375,7 +385,7 @@ class TestGetVideoInfo: assert info["video.pix_fmt"] == "yuv420p" assert info["video.fps"] == 30 assert info["video.channels"] == 3 - assert info["video.is_depth_map"] is False + assert info["is_depth_map"] is False assert info["has_audio"] is False assert "video.g" not in info assert "video.crf" not in info @@ -383,9 +393,9 @@ class TestGetVideoInfo: @require_libsvtav1 def test_merges_encoder_config_as_video_prefixed_entries(self): - cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + cfg = RGBEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) - info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", video_encoder=cfg) assert info["video.g"] == 2 assert info["video.crf"] == 30 @@ -396,13 +406,18 @@ class TestGetVideoInfo: @require_libsvtav1 def test_stream_derived_keys_take_precedence_over_config(self): - cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p") + cfg = RGBEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p") - info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", video_encoder=cfg) assert info["video.codec"] # populated from stream, not from config's vcodec assert info["video.pix_fmt"] == "yuv420p" + def test_depth_encoder_config_sets_is_depth_map_true(self): + """A ``DepthEncoderConfig`` causes ``get_video_info`` to mark the stream as depth.""" + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", 
video_encoder=DepthEncoderConfig()) + assert info["is_depth_map"] is True + class TestEncodeVideoFrames: @require_libsvtav1 @@ -434,7 +449,7 @@ class TestEncodeVideoFrames: def test_overwrite_false_skips_existing_file(self, tmp_path): imgs_dir = tmp_path / "imgs" - _write_frames(imgs_dir) + _write_color_frames(imgs_dir) video_path = tmp_path / "out.mp4" sentinel = b"pre-existing content" video_path.write_bytes(sentinel) @@ -446,7 +461,7 @@ class TestEncodeVideoFrames: @require_libsvtav1 def test_overwrite_true_replaces_existing_file(self, tmp_path): imgs_dir = tmp_path / "imgs" - _write_frames(imgs_dir) + _write_color_frames(imgs_dir) video_path = tmp_path / "out.mp4" video_path.write_bytes(b"stale content") @@ -458,10 +473,10 @@ class TestEncodeVideoFrames: @require_libsvtav1 def test_custom_encoder_config_fields_stored_in_info(self, tmp_path): """All stream-derived and encoder config fields are present after encoding.""" - cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10) + cfg = RGBEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10) video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg) - info = get_video_info(video_path, camera_encoder=cfg) + info = get_video_info(video_path, video_encoder=cfg) # Stream-derived assert info["video.height"] == 64 @@ -470,7 +485,7 @@ class TestEncodeVideoFrames: assert info["video.codec"] == "av1" assert info["video.pix_fmt"] == "yuv420p" assert info["video.fps"] == 30 - assert info["video.is_depth_map"] is False + assert info["is_depth_map"] is False assert info["has_audio"] is False # Encoder config assert info["video.g"] == 4 @@ -487,15 +502,15 @@ class TestReencodeVideo: def test_reencode_video(self, tmp_path): src = TEST_ARTIFACTS_DIR / "clip_4frames.mp4" out = tmp_path / "reencoded.mp4" - cfg = VideoEncoderConfig(vcodec="h264", g=6, crf=23, pix_fmt="yuv444p") - reencode_video(src, out, camera_encoder=cfg, overwrite=True) + cfg = RGBEncoderConfig(vcodec="h264", g=6, crf=23, 
pix_fmt="yuv444p") + reencode_video(src, out, video_encoder=cfg, overwrite=True) assert out.exists() with av.open(str(out)) as container: n_frames = sum(1 for _ in container.decode(video=0)) assert n_frames == 4 - info = get_video_info(out, camera_encoder=cfg) + info = get_video_info(out, video_encoder=cfg) assert info["video.codec"] == "h264" assert info["video.pix_fmt"] == "yuv444p" assert info["video.height"] == 64 @@ -508,8 +523,8 @@ class TestReencodeVideo: def test_reencode_video_trim_window(self, tmp_path): src = TEST_ARTIFACTS_DIR / "clip_6frames.mp4" out = tmp_path / "trim_window.mp4" - cfg = VideoEncoderConfig(vcodec="h264") - reencode_video(src, out, camera_encoder=cfg, start_time_s=0.05, end_time_s=0.12, overwrite=True) + cfg = RGBEncoderConfig(vcodec="h264") + reencode_video(src, out, video_encoder=cfg, start_time_s=0.05, end_time_s=0.12, overwrite=True) with av.open(str(out)) as container: frames = list(container.decode(video=0)) @@ -578,12 +593,12 @@ class TestEncoderConfigPersistence: @require_libsvtav1 def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory): - cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + cfg = RGBEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) dataset = empty_lerobot_dataset_factory( - root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + root=tmp_path / "ds", features=DUMMY_VIDEO_FEATURES, use_videos=True, rgb_encoder=cfg ) - _add_frames(dataset, num_frames=4) + add_frames(dataset, num_frames=4) dataset.save_episode() dataset.finalize() @@ -601,16 +616,16 @@ class TestEncoderConfigPersistence: @require_libsvtav1 def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory): - cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + cfg = RGBEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) dataset = empty_lerobot_dataset_factory( - root=tmp_path / "ds", 
features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + root=tmp_path / "ds", features=DUMMY_VIDEO_FEATURES, use_videos=True, rgb_encoder=cfg ) - _add_frames(dataset, num_frames=4) + add_frames(dataset, num_frames=4) dataset.save_episode() first_info = dict(_read_feature_info(dataset)) - _add_frames(dataset, num_frames=4) + add_frames(dataset, num_frames=4) dataset.save_episode() dataset.finalize() @@ -618,13 +633,13 @@ class TestEncoderConfigPersistence: class TestFromVideoInfo: - """``VideoEncoderConfig.from_video_info`` reconstructs an encoder config + """``RGBEncoderConfig.from_video_info`` reconstructs an encoder config from the ``video.*`` keys persisted in a dataset's ``info.json``. """ @require_libsvtav1 def test_reconstructs_from_dummy_video_info(self): - cfg = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO) + cfg = RGBEncoderConfig.from_video_info(DUMMY_VIDEO_INFO) # Canonical stream codec ``"av1"`` is aliased to the encoder name. assert cfg.vcodec == "libsvtav1" @@ -636,4 +651,220 @@ class TestFromVideoInfo: assert cfg.video_backend == DUMMY_VIDEO_INFO["video.video_backend"] # ``{}`` placeholder (typical after a merge with disagreeing sources) # must not leak into the reconstructed config. - assert cfg.extra_options == VideoEncoderConfig().extra_options + assert cfg.extra_options == RGBEncoderConfig().extra_options + + +# ─── Depth-specific encoding tests ──────────────────────────────────── + + +class TestEncodeDepthVideoFrames: + """Depth mirror of :class:`TestEncodeVideoFrames`. + + Exercises ``encode_video_frames`` end-to-end through + :class:`DepthEncoderConfig` (HEVC Main 12 / ``gray12le``) on synthetic + uint16 depth TIFFs. 
+ """ + + @require_hevc + def test_produces_readable_file(self, tmp_path): + video_path = _encode_video(tmp_path / "out.mp4", depth=True) + + assert video_path.exists() + info = get_video_info(video_path, video_encoder=DepthEncoderConfig()) + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.codec"] == "hevc" + assert info["video.pix_fmt"] == "gray12le" + assert info["video.channels"] == 1 + assert info["is_depth_map"] is True + + @require_hevc + def test_frame_count_and_duration_match_input(self, tmp_path): + num_frames = 10 + fps = 30 + video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps, depth=True) + + with av.open(str(video_path)) as container: + stream = container.streams.video[0] + actual_frames = sum(1 for _ in container.decode(stream)) + duration = ( + float(stream.duration * stream.time_base) + if stream.duration is not None + else float(container.duration / av.time_base) + ) + + assert actual_frames == num_frames + assert abs(duration - num_frames / fps) < 0.1 + + def test_overwrite_false_skips_existing_file(self, tmp_path): + """Codec-agnostic: file-system semantics must hold even without an HEVC encoder.""" + imgs_dir = tmp_path / "imgs" + _write_depth_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + sentinel = b"pre-existing depth content" + video_path.write_bytes(sentinel) + + encode_video_frames(imgs_dir, video_path, fps=30, video_encoder=DepthEncoderConfig(), overwrite=False) + + assert video_path.read_bytes() == sentinel + + @require_hevc + def test_overwrite_true_replaces_existing_file(self, tmp_path): + imgs_dir = tmp_path / "imgs" + _write_depth_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + video_path.write_bytes(b"stale content") + + encode_video_frames(imgs_dir, video_path, fps=30, video_encoder=DepthEncoderConfig(), overwrite=True) + + info = get_video_info(video_path, video_encoder=DepthEncoderConfig()) + assert info["video.height"] == 64 + assert 
info["video.pix_fmt"] == "gray12le" + assert info["is_depth_map"] is True + + @require_hevc + def test_custom_encoder_config_fields_stored_in_info(self, tmp_path): + """All stream-derived and depth-encoder config fields are present after encoding.""" + cfg = DepthEncoderConfig( + vcodec="hevc", + pix_fmt="gray12le", + g=4, + crf=25, + extra_options={}, + depth_min=0.05, + depth_max=8.0, + shift=2.5, + use_log=False, + ) + video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg, depth=True) + + info = get_video_info(video_path, video_encoder=cfg) + + # Stream-derived + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.channels"] == 1 + assert info["video.codec"] == "hevc" + assert info["video.pix_fmt"] == "gray12le" + assert info["video.fps"] == 30 + assert info["is_depth_map"] is True + assert info["has_audio"] is False + # Base encoder config + assert info["video.g"] == 4 + assert info["video.crf"] == 25 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + # Depth-specific tuning + assert info["video.depth_min"] == 0.05 + assert info["video.depth_max"] == 8.0 + assert info["video.shift"] == 2.5 + assert info["video.use_log"] is False + + +class TestDepthEncoderConfigPersistence: + """Depth mirror of :class:`TestEncoderConfigPersistence`. + + ``DepthEncoderConfig`` must be stored as ``video.`` entries + (including the depth-specific ``depth_min`` / ``depth_max`` / ``shift`` / + ``use_log``) under ``info["features"][]["info"]`` when the + first episode is saved. 
+ """ + + @require_hevc + def test_first_episode_save_persists_depth_encoder_config(self, tmp_path, empty_lerobot_dataset_factory): + cfg = DepthEncoderConfig( + vcodec="hevc", + pix_fmt="gray12le", + g=2, + crf=30, + extra_options={}, + depth_min=0.05, + depth_max=8.0, + shift=2.5, + use_log=False, + ) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=DUMMY_DEPTH_FEATURES, use_videos=True, depth_encoder=cfg + ) + + add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + info = _read_feature_info(dataset, key=DUMMY_DEPTH_KEY) + + # Stream-derived + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.fps"] == 30 + assert info["video.codec"] == "hevc" + assert info["video.pix_fmt"] == "gray12le" + assert info["is_depth_map"] is True + # Base encoder config + assert info["video.g"] == 2 + assert info["video.crf"] == 30 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + # Depth-specific tuning + assert info["video.depth_min"] == 0.05 + assert info["video.depth_max"] == 8.0 + assert info["video.shift"] == 2.5 + assert info["video.use_log"] is False + + @require_hevc + def test_second_episode_does_not_overwrite_depth_encoder_fields( + self, tmp_path, empty_lerobot_dataset_factory + ): + cfg = DepthEncoderConfig( + vcodec="hevc", + pix_fmt="gray12le", + g=2, + crf=30, + depth_min=0.05, + depth_max=8.0, + shift=2.5, + use_log=False, + ) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=DUMMY_DEPTH_FEATURES, use_videos=True, depth_encoder=cfg + ) + + add_frames(dataset, num_frames=4) + dataset.save_episode() + first_info = dict(_read_feature_info(dataset, key=DUMMY_DEPTH_KEY)) + + add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + assert _read_feature_info(dataset, key=DUMMY_DEPTH_KEY) == first_info + + +class TestDepthFromVideoInfo: + 
"""``DepthEncoderConfig.from_video_info`` reconstructs a depth encoder + config from the ``video.*`` keys persisted in a dataset's ``info.json``. + + Depth mirror of :class:`TestFromVideoInfo`. + """ + + @require_hevc + def test_reconstructs_from_dummy_depth_video_info(self): + cfg = DepthEncoderConfig.from_video_info(DUMMY_DEPTH_VIDEO_INFO_FULL) + + # No alias for ``"hevc"``; the canonical stream codec is reused as-is. + assert cfg.vcodec == "hevc" + assert cfg.pix_fmt == DUMMY_DEPTH_VIDEO_INFO_FULL["video.pix_fmt"] + assert cfg.g == DUMMY_DEPTH_VIDEO_INFO_FULL["video.g"] + assert cfg.crf == DUMMY_DEPTH_VIDEO_INFO_FULL["video.crf"] + assert cfg.fast_decode == DUMMY_DEPTH_VIDEO_INFO_FULL["video.fast_decode"] + assert cfg.video_backend == DUMMY_DEPTH_VIDEO_INFO_FULL["video.video_backend"] + # ``{}`` placeholder (typical after a merge with disagreeing sources) + # must not leak into the reconstructed config. + assert cfg.extra_options == DepthEncoderConfig().extra_options + # Depth-specific tuning round-trips through ``info.json``. 
+ assert cfg.depth_min == DUMMY_DEPTH_VIDEO_INFO_FULL["video.depth_min"] + assert cfg.depth_max == DUMMY_DEPTH_VIDEO_INFO_FULL["video.depth_max"] + assert cfg.shift == DUMMY_DEPTH_VIDEO_INFO_FULL["video.shift"] + assert cfg.use_log == DUMMY_DEPTH_VIDEO_INFO_FULL["video.use_log"] diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py index 4d578b503..d6f4f8ae5 100644 --- a/tests/fixtures/constants.py +++ b/tests/fixtures/constants.py @@ -39,12 +39,56 @@ DUMMY_VIDEO_INFO = { "video.crf": 30, "video.preset": 12, "video.fast_decode": 0, - "video.is_depth_map": False, + "is_depth_map": False, "has_audio": False, } DUMMY_CAMERA_FEATURES = { "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, } +DUMMY_DEPTH_VIDEO_INFO = { + **DUMMY_VIDEO_INFO, + "is_depth_map": True, +} +DUMMY_DEPTH_VIDEO_INFO_FULL = { + **{k: v for k, v in DUMMY_VIDEO_INFO.items() if k != "video.preset"}, + "video.codec": "hevc", + "video.pix_fmt": "gray12le", + "is_depth_map": True, + "video.depth_min": 0.05, + "video.depth_max": 8.0, + "video.shift": 2.5, + "video.use_log": True, +} +DUMMY_DEPTH_CAMERA_FEATURES = { + "laptop_depth": { + "shape": (64, 96, 1), + "names": ["height", "width", "channels"], + "info": DUMMY_DEPTH_VIDEO_INFO, + }, +} +DUMMY_CAMERA_FEATURES_WITH_DEPTH = {**DUMMY_CAMERA_FEATURES, **DUMMY_DEPTH_CAMERA_FEATURES} DUMMY_CHW = (3, 96, 128) DUMMY_HWC = (96, 128, 3) + +# Default video feature set used by video-encoding persistence tests. 
+DUMMY_VIDEO_FEATURES = { + "observation.images.cam": { + "dtype": "video", + "shape": (64, 96, 3), + "names": ["height", "width", "channels"], + }, + "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, +} +DUMMY_VIDEO_KEY = "observation.images.cam" + +DUMMY_DEPTH_FEATURES = { + "observation.images.depth": { + "dtype": "video", + "shape": (64, 96, 1), + "names": ["height", "width", "channels"], + "info": {"is_depth_map": True}, + }, + "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, +} +DUMMY_DEPTH_KEY = "observation.images.depth" diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py index 2f4d41ff8..100922f9c 100644 --- a/tests/fixtures/dataset_factories.py +++ b/tests/fixtures/dataset_factories.py @@ -49,6 +49,39 @@ from tests.fixtures.constants import ( ) +def add_frames(dataset: LeRobotDataset, num_frames: int) -> None: + """Append ``num_frames`` synthetic frames to ``dataset``. + + Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for + keys in ``dataset.meta.depth_keys``, uint8 random noise for video/image keys, + and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, + frame_index, ...) are auto-populated by ``add_frame`` and skipped here. + """ + video_keys = dataset.meta.video_keys + depth_keys = dataset.meta.depth_keys + # Smooth gradient base reused per (H, W) to keep depth frames cheap to + # encode (HEVC Main 12 hates white noise). 
+ _depth_base_cache: dict[tuple[int, int], np.ndarray] = {} + for i in range(num_frames): + frame: dict = {"task": "test"} + for key, ft in dataset.meta.features.items(): + if key in DEFAULT_FEATURES: + continue + shape = ft["shape"] + if key in depth_keys: + h, w, _ = shape + base = _depth_base_cache.setdefault( + (h, w), + np.linspace(100.0, 10_000.0, h * w, dtype=np.float32).reshape(h, w, 1), + ) + frame[key] = (base + 50.0 * i).clip(0, 65535).astype(np.uint16) + elif key in video_keys: + frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8) + else: + frame[key] = np.zeros(shape, dtype=np.float32) + dataset.add_frame(frame) + + class LeRobotDatasetFactory(Protocol): def __call__(self, *args, **kwargs) -> LeRobotDataset: ... @@ -485,10 +518,14 @@ def lerobot_dataset_factory( hf_dataset: datasets.Dataset | None = None, data_files_size_in_mb: float = DEFAULT_DATA_FILE_SIZE_IN_MB, chunks_size: int = DEFAULT_CHUNK_SIZE, + camera_features: dict | None = None, **kwargs, ) -> LeRobotDataset: # Instantiate objects if info is None: + info_kwargs = {} + if camera_features is not None: + info_kwargs["camera_features"] = camera_features info = info_factory( total_episodes=total_episodes, total_frames=total_frames, @@ -496,6 +533,7 @@ def lerobot_dataset_factory( use_videos=use_videos, data_files_size_in_mb=data_files_size_in_mb, chunks_size=chunks_size, + **info_kwargs, ) if stats is None: stats = stats_factory(features=info.features) diff --git a/tests/scripts/test_edit_dataset_parsing.py b/tests/scripts/test_edit_dataset_parsing.py index c90cffb38..22a3c1be2 100644 --- a/tests/scripts/test_edit_dataset_parsing.py +++ b/tests/scripts/test_edit_dataset_parsing.py @@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import ( MergeConfig, ModifyTasksConfig, OperationConfig, + ReencodeVideosConfig, RemoveFeatureConfig, SplitConfig, _validate_config, @@ -103,3 +104,47 @@ class TestOperationTypeParsing: ) resolved_name = 
OperationConfig.get_choice_name(type(cfg.operation)) assert resolved_name == type_name + + +class TestDepthEncoderParsing: + """Test that the depth encoder is exposed and parsed for video operations.""" + + def test_reencode_has_default_depth_encoder(self): + cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", "reencode_videos"]) + assert isinstance(cfg.operation, ReencodeVideosConfig) + # A depth encoder is configured by default so depth videos are re-encoded too. + assert cfg.operation.depth_encoder is not None + assert hasattr(cfg.operation.depth_encoder, "depth_min") + + def test_reencode_parses_depth_encoder_overrides(self): + cfg = parse_cfg( + [ + "--repo_id", + "test/repo", + "--operation.type", + "reencode_videos", + "--operation.depth_encoder.extra_options", + '{"x265-params": "lossless=1"}', + "--operation.depth_encoder.depth_max", + "12.0", + "--operation.depth_encoder.use_log", + "false", + ] + ) + assert cfg.operation.depth_encoder.extra_options == {"x265-params": "lossless=1"} + assert cfg.operation.depth_encoder.depth_max == 12.0 + assert cfg.operation.depth_encoder.use_log is False + + def test_convert_image_to_video_parses_depth_encoder_overrides(self): + cfg = parse_cfg( + [ + "--repo_id", + "test/repo", + "--operation.type", + "convert_image_to_video", + "--operation.depth_encoder.depth_min", + "0.05", + ] + ) + assert isinstance(cfg.operation, ConvertImageToVideoConfig) + assert cfg.operation.depth_encoder.depth_min == 0.05 diff --git a/tests/utils/test_visualization_utils.py b/tests/utils/test_visualization_utils.py index 63ff76c77..5bd1552db 100644 --- a/tests/utils/test_visualization_utils.py +++ b/tests/utils/test_visualization_utils.py @@ -43,6 +43,11 @@ def mock_rerun(monkeypatch): def __init__(self, arr): self.arr = arr + class DummyDepthImage: + def __init__(self, arr, colormap=None): + self.arr = arr + self.colormap = colormap + def dummy_log(key, obj=None, **kwargs): # Accept either positional `obj` or keyword `entity` and 
record remaining kwargs. if obj is None and "entity" in kwargs: @@ -55,6 +60,8 @@ def mock_rerun(monkeypatch): __spec__=SimpleNamespace(name="rerun", submodule_search_locations=None), Scalars=DummyScalar, Image=DummyImage, + DepthImage=DummyDepthImage, + components=SimpleNamespace(Colormap=SimpleNamespace(Viridis="viridis")), log=dummy_log, init=lambda *a, **k: None, spawn=lambda *a, **k: None, @@ -225,7 +232,7 @@ def test_log_rerun_data_kwargs_only(mock_rerun): assert temp.value == pytest.approx(10.0) img = _obj_for(calls, "observation.gray") - assert type(img).__name__ == "DummyImage" + assert type(img).__name__ == "DummyDepthImage" # single-channel -> DepthImage assert img.arr.shape == (8, 8, 1) # remains HWC assert _kwargs_for(calls, "observation.gray").get("static", False) is True