From bd9619dfc3fadd3647408537d89e82f83f770851 Mon Sep 17 00:00:00 2001 From: Caroline Pascal Date: Thu, 14 May 2026 23:46:42 +0200 Subject: [PATCH] feat(encoding parameters): adding support for user provided video encoding parameters (#3455) * chore(video backend): renaming codec into video_backend in get_safe_default_video_backend() * feat(pyav utils): adding support for PyAV encoding parameters validation * feat(VideoEncoderConfig): creating a VideoEncoderConfig to encapsulate encoding parameters * feat(VideoEncoderConfig): propagating the VideoEncoderConfig in the codebase * chore(docs): updating the docs * feat(metadata): adding encoding parameters in dataset metadata * fix(concatenation compatibility): adding compatibility check when concatenating video files * feat(VideoEncoderConfig init): making VideoEncoderConfig more robust and adaptable to multiple backends * feat(pyav checks): making pyav parameters checks more robust * chore(duplicate): removing duplicate get_codec_options definition * test(existing): adapting existing tests * test(new): adding new tests for encoding related features * chore(format): fixing formatting issues * chore(PyAV): cleaning up PyAV utils and encoding parameters checks to stick to the minimum required tooling. * chore(format): formatting code * chore(docstrings): updating docstrings * fix(camera_encoder_config): Removing camera_encoder_config from LeRobotDataset, as it's only required in LeRobotDatasetWriter. 
* feat(default values): applying a consistent naming convention for default RGB cameras video encoder parameters * fix(rollout): propagating VideoEncoderConfig to the latest recording modes * chore(format): formatting code, fixing error messages and variable names * fix(arguments order): reverting changes in arguments order in StreamingVideoEncoder * chore(relative imports): switching to relative local imports within lerobot.datasets * test(artifacts): cleaning up artifacts for the video encoding tests * chore(docs): updating docs * chore(format): formatting code * fix(imports): refactoring the file architecture to avoid circular imports. VideoEncoderConfig is now defined in lerobot.configs and lazily imports av at runtime. * fix(typos): fixing typos and small mistakes * test(factories): updating factories * feat(aggregate): updating dataset aggregation procedure. Encoding tuning parameters (crf, g,...) are ignored for validation and changed to None in the aggregated dataset if incompatible. 
* docs(typos): fixing typos * fix(deletion): reverting unwanted deletion * fix(typos): fixing multiple typos * feat(codec options): passing codec options to lerobot_edit_dataset episode deletion tool * typo(typo): typo * fix(typos): fixing remaining typos * chore(rename): renaming camera_encoder_config to camera_encoder * docs(clean): cleaning and formatting docs * docs(dataset): adding details about datasets * chore(format): formatting code * docs(warning): adding warning regarding encoding parameters modification * fix(re-encoding): removing inconsistent re-encoding option in lerobot_edit_dataset * typos(typos): typos * chore(format): resolving prettier issues * fix(h264_nvenc): fixing crf handling for h264_nvenc * docs(clean): removing too technical parts of the docs * fix(imports): fixing imports at the __init__ level * fix(imports): fixing not very pretty imports in video config file --- docs/source/_toctree.yml | 2 + docs/source/act.mdx | 2 +- docs/source/earthrover_mini_plus.mdx | 2 +- docs/source/groot.mdx | 2 +- docs/source/hope_jr.mdx | 4 +- docs/source/il_robots.mdx | 2 +- docs/source/lerobot-dataset-v3.mdx | 2 +- docs/source/reachy2.mdx | 4 +- docs/source/smolvla.mdx | 2 +- docs/source/streaming_video_encoding.mdx | 44 +- docs/source/using_dataset_tools.mdx | 14 +- docs/source/video_encoding_parameters.mdx | 117 ++++ src/lerobot/configs/__init__.py | 12 + src/lerobot/configs/dataset.py | 11 +- src/lerobot/configs/default.py | 4 +- src/lerobot/configs/eval.py | 2 +- src/lerobot/configs/rewards.py | 3 +- src/lerobot/configs/train.py | 2 +- src/lerobot/configs/video.py | 235 +++++++ src/lerobot/datasets/__init__.py | 3 + src/lerobot/datasets/aggregate.py | 56 +- src/lerobot/datasets/dataset_metadata.py | 20 +- src/lerobot/datasets/dataset_tools.py | 94 ++- src/lerobot/datasets/dataset_writer.py | 34 +- src/lerobot/datasets/feature_utils.py | 36 ++ src/lerobot/datasets/lerobot_dataset.py | 79 ++- src/lerobot/datasets/pyav_utils.py | 174 +++++ 
src/lerobot/datasets/video_utils.py | 228 +++---- src/lerobot/policies/eo1/modeling_eo1.py | 5 +- src/lerobot/policies/eo1/processor_eo1.py | 3 +- .../rewards/classifier/modeling_classifier.py | 5 +- .../classifier/processor_classifier.py | 3 +- src/lerobot/rewards/factory.py | 7 +- .../rewards/sarm/compute_rabc_weights.py | 7 +- src/lerobot/rewards/sarm/modeling_sarm.py | 9 +- src/lerobot/rewards/sarm/processor_sarm.py | 9 +- src/lerobot/rollout/context.py | 4 +- src/lerobot/scripts/lerobot_edit_dataset.py | 16 +- src/lerobot/scripts/lerobot_record.py | 31 +- src/lerobot/scripts/lerobot_rollout.py | 12 + src/lerobot/transport/utils.py | 3 +- src/lerobot/utils/import_utils.py | 5 +- tests/artifacts/encoded_videos/clip_32x48.mp4 | 3 + .../artifacts/encoded_videos/clip_4frames.mp4 | 3 + .../artifacts/encoded_videos/clip_5frames.mp4 | 3 + .../artifacts/encoded_videos/clip_6frames.mp4 | 3 + tests/artifacts/encoded_videos/clip_h264.mp4 | 3 + tests/datasets/test_aggregate.py | 76 ++- tests/datasets/test_dataset_reader.py | 6 +- tests/datasets/test_dataset_tools.py | 13 +- tests/datasets/test_dataset_writer.py | 24 +- tests/datasets/test_datasets.py | 14 +- .../datasets/test_streaming_video_encoder.py | 221 +++---- tests/datasets/test_video_encoding.py | 595 ++++++++++++++++++ tests/fixtures/constants.py | 14 +- tests/fixtures/dataset_factories.py | 5 +- 56 files changed, 1765 insertions(+), 527 deletions(-) create mode 100644 docs/source/video_encoding_parameters.mdx create mode 100644 src/lerobot/configs/video.py create mode 100644 src/lerobot/datasets/pyav_utils.py create mode 100644 tests/artifacts/encoded_videos/clip_32x48.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_4frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_5frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_6frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_h264.mp4 create mode 100644 tests/datasets/test_video_encoding.py diff --git 
a/docs/source/_toctree.yml b/docs/source/_toctree.yml index de4eeaa28..f1dfe9aae 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -41,6 +41,8 @@ title: Using the Dataset Tools - local: dataset_subtask title: Using Subtasks in the Dataset + - local: video_encoding_parameters + title: Video encoding parameters - local: streaming_video_encoding title: Streaming Video Encoding title: "Datasets" diff --git a/docs/source/act.mdx b/docs/source/act.mdx index 453bcbba8..8e91edcf9 100644 --- a/docs/source/act.mdx +++ b/docs/source/act.mdx @@ -90,6 +90,6 @@ lerobot-record \ --dataset.single_task="Your task description" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=${HF_USER}/act_policy ``` diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx index a87bd325b..508c0e3a9 100644 --- a/docs/source/earthrover_mini_plus.mdx +++ b/docs/source/earthrover_mini_plus.mdx @@ -194,7 +194,7 @@ lerobot-record \ --dataset.single_task="Navigate around obstacles" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx index 2f53a4d0b..d69d10a57 100644 --- a/docs/source/groot.mdx +++ b/docs/source/groot.mdx @@ -123,7 +123,7 @@ lerobot-record \ --dataset.single_task="Grab and handover the red cube to the other arm" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=/groot-bimanual \ # your trained model --dataset.episode_time_s=30 \ --dataset.reset_time_s=10 diff --git a/docs/source/hope_jr.mdx b/docs/source/hope_jr.mdx index 8826d9758..1f3b08fd7 100644 --- a/docs/source/hope_jr.mdx +++ b/docs/source/hope_jr.mdx @@ -232,7 +232,7 @@ lerobot-record \ --dataset.private=true 
\ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` @@ -278,6 +278,6 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model ``` diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index ff0a6229e..07789225a 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -193,7 +193,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx index 8ab4a5d40..6f3e6d948 100644 --- a/docs/source/lerobot-dataset-v3.mdx +++ b/docs/source/lerobot-dataset-v3.mdx @@ -43,7 +43,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx index 1b868711a..4b08569db 100644 --- a/docs/source/reachy2.mdx +++ b/docs/source/reachy2.mdx @@ -161,7 +161,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` @@ -203,7 +203,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/smolvla.mdx b/docs/source/smolvla.mdx index bf8a0d2f0..6c63c5d11 
100644 --- a/docs/source/smolvla.mdx +++ b/docs/source/smolvla.mdx @@ -108,7 +108,7 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ # <- Teleop optional if you want to teleoperate in between episodes \ # --teleop.type=so100_leader \ # --teleop.port=/dev/ttyACM0 \ diff --git a/docs/source/streaming_video_encoding.mdx b/docs/source/streaming_video_encoding.mdx index 40004200e..96e049eb3 100644 --- a/docs/source/streaming_video_encoding.mdx +++ b/docs/source/streaming_video_encoding.mdx @@ -17,9 +17,9 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti | Parameter | CLI Flag | Type | Default | Description | | ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- | | `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture | -| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | +| `vcodec` | `--dataset.camera_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | | `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide | -| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM | +| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `30` | Max buffered frames per camera (~1s at 30fps). Consumes RAM | ## 3. Performance Considerations @@ -48,7 +48,7 @@ This parameter controls how many threads each encoder instance uses internally: ### Backpressure and Frame Dropping -Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). 
When the encoder can't keep up: +Each camera has a bounded queue (`encoder_queue_maxsize`, default 30 frames). When the encoder can't keep up: 1. The queue fills up (consuming RAM) 2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted @@ -82,15 +82,15 @@ Use HW encoding when: ### Available HW Encoders -| Encoder | Platform | Hardware | CLI Value | -| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ | -| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` | -| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` | -| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` | -| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` | -| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` | -| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` | -| `auto` | Any | Probes the system for available HW encoders. 
Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` | +| Encoder | Platform | Hardware | CLI Value | +| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | --------------------------------------------------- | +| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=h264_videotoolbox` | +| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=hevc_videotoolbox` | +| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=h264_nvenc` | +| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=hevc_nvenc` | +| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder.vcodec=h264_vaapi` | +| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder.vcodec=h264_qsv` | +| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder.vcodec=auto` | > [!NOTE] > In order to use the HW accelerated encoders you might need to upgrade your GPU drivers. @@ -100,15 +100,15 @@ Use HW encoding when: ## 5. Troubleshooting -| Symptom | Likely Cause | Fix | -| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) | -| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). | -| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | -| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | -| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | -| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` | -| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | +| Symptom | Likely Cause | Fix | +| ------------------------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder.vcodec=auto`) | +| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder.vcodec=auto`). | +| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | +| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | +| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | +| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder.vcodec=auto` | +| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | ## 6. Recommended Configurations @@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the # 2camsx 640x480x3 @30fps: Requires some tuning. # Use H.264, disable streaming, consider batching encoding -lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ... +lerobot-record --dataset.camera_encoder.vcodec=h264 --dataset.streaming_encoding=false ... ``` ## 7. 
Closing note diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index f7fc9be20..49247a6c1 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -117,10 +117,10 @@ lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ --operation.type convert_image_to_video \ --operation.output_dir outputs/pusht_video \ - --operation.vcodec libsvtav1 \ - --operation.pix_fmt yuv420p \ - --operation.g 2 \ - --operation.crf 30 + --operation.camera_encoder.vcodec libsvtav1 \ + --operation.camera_encoder.pix_fmt yuv420p \ + --operation.camera_encoder.g 2 \ + --operation.camera_encoder.crf 30 # Convert only specific episodes lerobot-edit-dataset \ @@ -147,11 +147,7 @@ lerobot-edit-dataset \ **Parameters:** - `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`) -- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`) -- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`) -- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2) -- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30) -- `fast_decode`: Fast decode tuning option (default: 0) +- `camera_encoder`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder.. See [Video Encoding Parameters](./video_encoding_parameters) for more details. 
- `episode_indices`: List of specific episodes to convert (default: all episodes) - `num_workers`: Number of parallel workers for processing (default: 4) diff --git a/docs/source/video_encoding_parameters.mdx b/docs/source/video_encoding_parameters.mdx new file mode 100644 index 000000000..0b5b99b2b --- /dev/null +++ b/docs/source/video_encoding_parameters.mdx @@ -0,0 +1,117 @@ +# Video encoding parameters + +When video storage is enabled, LeRobot stores each camera stream as an **MP4** file instead of saving one image file per timestep. Video encoding compresses across time, which usually cuts dataset size and I/O compared to a pile of PNG, while keeping MP4 — a format every player and loader understands. + +Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `camera_encoder`, a nested `VideoEncoderConfig` (`lerobot.configs.video.VideoEncoderConfig`) passed through PyAV. + +You can set these parameters from the CLI with `--dataset.camera_encoder.` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every camera video stream in that run. + + + Video storage must be on for `camera_encoder` to have any effect — + `use_videos=True` in Python APIs, or `--dataset.video=true` on the CLI (the + recording default). With video off, inputs stay as images and `camera_encoder` + is ignored. + + +For details on **when** frames are written vs. encoded (streaming vs. post-episode), queues, and other top-level `--dataset.*` switches, see [Streaming Video Encoding](./streaming_video_encoding). For an encoding-parameter comparison and experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark). 
+ +--- + +## Example + +```bash +lerobot-record \ + --robot.type=so100_follower \ + --robot.port=/dev/tty.usbmodem58760431541 \ + --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --robot.id=black \ + --teleop.type=so100_leader \ + --teleop.port=/dev/tty.usbmodem58760431551 \ + --teleop.id=blue \ + --dataset.repo_id=/ \ + --dataset.num_episodes=2 \ + --dataset.single_task="Grab the cube" \ + --dataset.streaming_encoding=true \ + --dataset.encoder_threads=2 \ + --dataset.camera_encoder.vcodec=h264 \ + --dataset.camera_encoder.preset=fast \ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \ + --display_data=true +``` + +--- + +## Tuning parameters + + +The defaults are tuned to balance **compression ratio**, **visual quality**, and **decoding/seek speed** for typical robotics datasets. Changing them can affect both recording (CPU load, frame drops) and training (decoding throughput, image quality). + +Only override these parameters if you have a specific reason to, and measure the impact on your pipeline before relying on the new settings. + + + +All flags below are prefixed with `--dataset.camera_encoder.` on the CLI. + +| Parameter | Type | Default | Description | +| --------------- | ---------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vcodec` | `str` | `"libsvtav1"` | Video codec name. `"auto"` picks the first available hardware encoder from a fixed preference list, falling back to `libsvtav1`. | +| `pix_fmt` | `str` | `"yuv420p"` | Output pixel format. Must be supported by the chosen codec in your FFmpeg build. | +| `g` | `int` | `2` | GOP size — a keyframe every `g` frames. Emitted as FFmpeg option `g`. 
| +| `crf` | `int` or `float` | `30` | Abstract quality value, mapped per codec (see the [mapping](#mapping-videoencoderconfig--ffmpeg-options) below). Lower → higher quality / larger output where the mapping is monotone. | +| `preset` | `int` or `str` | `12` \* | Encoder speed preset; meaning depends on the codec.
\* When unset and `vcodec=libsvtav1`, LeRobot defaults to `12`. | +| `fast_decode` | `int` | `0` | `libsvtav1`: `0–2`, passed via `svtav1-params`.
`h264` / `hevc` (software): if `>0`, sets `tune=fastdecode`.
Other codecs: usually unused. | +| `video_backend` | `str` | `"pyav"` | Only `"pyav"` is currently implemented for video encoding. | +| `extra_options` | `dict` | `{}` | Extra FFmpeg or codec specific options merged after the structured fields above. Cannot override keys already set by those fields. | + +--- + +## Persistence in dataset metadata + +After the first episode of a video stream is encoded, the encoder configuration is **persisted into the dataset metadata** (`meta/info.json`) under each video feature, alongside the values probed from the file itself. For a video feature `observation.images.`, the layout in `info.json` is: + +```json +{ + "features": { + "observation.images.laptop": { + "dtype": "video", + "shape": [480, 640, 3], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "h264", + "video.pix_fmt": "yuv420p", + "video.fps": 30, + "video.channels": 3, + "video.is_depth_map": false, + "video.g": 2, + "video.crf": 30, + "video.preset": "fast", + "video.fast_decode": 0, + "video.video_backend": "pyav", + "video.extra_options": { "tune": "film", "profile:v": "high", "bf": 2 } + } + } + } +} +``` + +Two sources contribute to the `info` block: + +- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present. +- **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`. + + + This block is populated **once**, from the **first** episode. It assumes every + episode in the dataset was encoded with the same `camera_encoder`. Changing + encoder settings partway through a recording is not supported — the + `info.json` will only reflect the parameters used for the first episode. 
+ + +--- + +## Merging datasets + +When aggregating datasets with `merge_datasets`, video files are concatenated as-is (no re-encoding), and encoder fields in `info.json` are merged per-key: + +- **Stream-derived fields must match** across sources: `video.codec`, `video.pix_fmt`, `video.height`, `video.width`, `video.fps`. Otherwise FFmpeg's concat demuxer fails. +- **Encoder-tuning fields are merged loosely**: `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.extra_options`. If every source agrees, the value is kept; if not, it's set to `null` (or `{}` for `video.extra_options`) and a warning is logged. diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py index ab74c3cd3..c3fe246cd 100644 --- a/src/lerobot/configs/__init__.py +++ b/src/lerobot/configs/__init__.py @@ -31,6 +31,12 @@ from .types import ( PolicyFeature, RTCAttentionSchedule, ) +from .video import ( + VALID_VIDEO_CODECS, + VIDEO_ENCODER_INFO_KEYS, + VideoEncoderConfig, + camera_encoder_defaults, +) __all__ = [ # Types @@ -46,4 +52,10 @@ __all__ = [ "PeftConfig", "PreTrainedConfig", "WandBConfig", + "VideoEncoderConfig", + # Defaults + "camera_encoder_defaults", + # Constants + "VALID_VIDEO_CODECS", + "VIDEO_ENCODER_INFO_KEYS", ] diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py index e3e17e62b..d5c6fa312 100644 --- a/src/lerobot/configs/dataset.py +++ b/src/lerobot/configs/dataset.py @@ -14,10 +14,12 @@ """Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``.""" -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime from pathlib import Path +from .video import VideoEncoderConfig, camera_encoder_defaults + @dataclass class DatasetRecordConfig: @@ -55,10 +57,9 @@ class DatasetRecordConfig: # Number of episodes to record before batch encoding videos # Set to 1 for immediate encoding (default behavior), or higher for batched encoding 
video_encoding_batch_size: int = 1 - # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto', - # or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'. - # Use 'auto' to auto-detect the best available hardware encoder. - vcodec: str = "libsvtav1" + # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys, + # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``). + camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) # Enable streaming video encoding: encode frames in real-time during capture instead # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding streaming_encoding: bool = False diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index b1eebba94..b809e71d9 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, field from lerobot.transforms import ImageTransformsConfig -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.utils.import_utils import get_safe_default_video_backend @dataclass @@ -34,7 +34,7 @@ class DatasetConfig: image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig) revision: str | None = None use_imagenet_stats: bool = True - video_backend: str = field(default_factory=get_safe_default_codec) + video_backend: str = field(default_factory=get_safe_default_video_backend) # When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0). # This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion. 
return_uint8: bool = False diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py index f2a1d3065..c285025ad 100644 --- a/src/lerobot/configs/eval.py +++ b/src/lerobot/configs/eval.py @@ -18,8 +18,8 @@ from logging import getLogger from pathlib import Path from lerobot import envs, policies # noqa: F401 -from lerobot.configs import parser +from . import parser from .default import EvalConfig from .policies import PreTrainedConfig diff --git a/src/lerobot/configs/rewards.py b/src/lerobot/configs/rewards.py index d495160bf..a53d5a417 100644 --- a/src/lerobot/configs/rewards.py +++ b/src/lerobot/configs/rewards.py @@ -27,12 +27,13 @@ from huggingface_hub import hf_hub_download from huggingface_hub.constants import CONFIG_NAME from huggingface_hub.errors import HfHubHTTPError -from lerobot.configs.types import PolicyFeature from lerobot.optim.optimizers import OptimizerConfig from lerobot.optim.schedulers import LRSchedulerConfig from lerobot.utils.device_utils import auto_select_torch_device, is_torch_device_available from lerobot.utils.hub import HubMixin +from .types import PolicyFeature + T = TypeVar("T", bound="RewardModelConfig") logger = logging.getLogger(__name__) diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index c5b4ff5f5..55498d3ac 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -25,11 +25,11 @@ from huggingface_hub import hf_hub_download from huggingface_hub.errors import HfHubHTTPError from lerobot import envs -from lerobot.configs import parser from lerobot.optim import LRSchedulerConfig, OptimizerConfig from lerobot.utils.hub import HubMixin from lerobot.utils.sample_weighting import SampleWeightingConfig +from . 
import parser from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig from .policies import PreTrainedConfig from .rewards import RewardModelConfig diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py new file mode 100644 index 000000000..bf2471453 --- /dev/null +++ b/src/lerobot/configs/video.py @@ -0,0 +1,235 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Note: We subclass str so that serialization is straightforward +# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json + +"""Video encoder configurations.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any + +from lerobot.utils.import_utils import require_package + +logger = logging.getLogger(__name__) + +# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend. +# Determines the order of preference for auto-selection when vcodec="auto" is used. +HW_VIDEO_CODECS = [ + "h264_videotoolbox", # macOS + "hevc_videotoolbox", # macOS + "h264_nvenc", # NVIDIA GPU + "hevc_nvenc", # NVIDIA GPU + "h264_vaapi", # Linux Intel/AMD + "h264_qsv", # Intel Quick Sync +] +VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}) +# Aliases for legacy video codec names. 
+VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"} + + +LIBSVTAV1_DEFAULT_PRESET: int = 12 + +# Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`). +# ``vcodec`` and ``pix_fmt`` are derived from the video stream directly. +VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset( + {"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"} +) +VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset( + f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES +) + + +@dataclass +class VideoEncoderConfig: + """Video encoder configuration. + + Attributes: + vcodec: Video encoder name. ``"auto"`` is resolved during + construction (HW encoder if available, else ``libsvtav1``). + pix_fmt: Pixel format (e.g. ``"yuv420p"``). + g: GOP size (keyframe interval). + crf: Quality level — mapped to the native quality parameter of the + codec (``crf`` for software, ``qp`` for NVENC/VAAPI, + ``q:v`` for VideoToolbox, ``global_quality`` for QSV). + preset: Speed/quality preset. Accepted type is per-codec. + fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2) + embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values + set ``tune=fastdecode``. Ignored for other codecs. + video_backend: Python backend to be used for encoding. Only ``"pyav"`` + is currently supported. + extra_options: Free-form dictionary of additional video encoder options + (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). + """ + + vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ? + pix_fmt: str = "yuv420p" + g: int | None = 2 + crf: int | float | None = 30 + preset: int | str | None = None + fast_decode: int = 0 + # TODO(CarolinePascal): add torchcodec support + find a way to unify the + # two backends (encoding and decoding).
+ video_backend: str = "pyav" + extra_options: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + self.resolve_vcodec() + # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work". + if self.preset is None and self.vcodec == "libsvtav1": + self.preset = LIBSVTAV1_DEFAULT_PRESET + self.validate() + + @classmethod + def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig: + """Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block. + Missing or ``None`` values fall back to the class defaults. + """ + video_info = video_info or {} + kwargs: dict[str, Any] = {} + + for src_key, dst_field in (("video.codec", "vcodec"), ("video.pix_fmt", "pix_fmt")): + value = video_info.get(src_key) + if value is not None: + kwargs[dst_field] = value + + for field_name in VIDEO_ENCODER_INFO_FIELD_NAMES: + value = video_info.get(f"video.{field_name}") + if value is None: + continue + # Persisted as ``{}`` after merges with disagreeing sources — treat as default. + if field_name == "extra_options" and not value: + continue + kwargs[field_name] = value + + return cls(**kwargs) + + def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: + """Return the subset of available encoders based on the specified video backend. + + Args: + encoders: List of encoder names to detect. If a string, it is converted to a list. + Returns: + List of available encoder names. If the video backend is not "pyav", returns an empty list. 
+ """ + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets import detect_available_encoders_pyav + + return detect_available_encoders_pyav(encoders) + return [] + + def validate(self) -> None: + """Validate the video encoder configuration.""" + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets import check_video_encoder_parameters_pyav + + check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options()) + + def resolve_vcodec(self) -> None: + """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder. + + For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the + resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``. + + Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are + rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``). + """ + self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec) + if self.vcodec not in VALID_VIDEO_CODECS: + raise ValueError(f"Invalid vcodec '{self.vcodec}'. 
Must be one of: {sorted(VALID_VIDEO_CODECS)}") + if self.vcodec == "auto": + available = self.detect_available_encoders(HW_VIDEO_CODECS) + for encoder in HW_VIDEO_CODECS: + if encoder in available: + logger.info(f"Auto-selected video codec: {encoder}") + self.vcodec = encoder + return + logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'") + self.vcodec = "libsvtav1" + + if self.detect_available_encoders(self.vcodec): + logger.info(f"Using video codec: {self.vcodec}") + return + raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}") + + def get_codec_options( + self, encoder_threads: int | None = None, as_strings: bool = False + ) -> dict[str, Any]: + """Translate the tuning fields to codec-specific options. + + ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field. + + Args: + encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. + For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. + For h264/hevc, this is mapped to ``threads``. + Hardware encoders ignore this parameter. + as_strings: If ``True``, casts values to strings. + """ + opts: dict[str, Any] = {} + + def set_if(key: str, value: Any) -> None: + if value is not None: + opts[key] = value if not as_strings else str(value) + + # GOP size is not a codec-specific option, so it is always set. 
+ set_if("g", self.g) + + if self.vcodec == "libsvtav1": + set_if("crf", self.crf) + set_if("preset", self.preset) + svtav1_parts: list[str] = [] + if self.fast_decode is not None: + svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}") + if encoder_threads is not None: + svtav1_parts.append(f"lp={encoder_threads}") + if svtav1_parts: + opts["svtav1-params"] = ":".join(svtav1_parts) + elif self.vcodec in ("h264", "hevc"): + set_if("crf", self.crf) + set_if("preset", self.preset) + if self.fast_decode: + opts["tune"] = "fastdecode" + set_if("threads", encoder_threads) + elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): + if self.crf is not None: + opts["q:v"] = max(1, min(100, 100 - self.crf * 2)) + elif self.vcodec in ("h264_nvenc", "hevc_nvenc"): + opts["rc"] = 0 + set_if("qp", self.crf) + set_if("preset", self.preset) + elif self.vcodec == "h264_vaapi": + set_if("qp", self.crf) + elif self.vcodec == "h264_qsv": + set_if("global_quality", self.crf) + set_if("preset", self.preset) + else: + set_if("crf", self.crf) + set_if("preset", self.preset) + + # Extra options are merged last but never override structured fields (values are kept as given). 
+ for k, v in self.extra_options.items(): + if k not in opts: + set_if(k, v) + + return opts + + +def camera_encoder_defaults() -> VideoEncoderConfig: + """Return a :class:`VideoEncoderConfig` with RGB-camera defaults.""" + return VideoEncoderConfig() diff --git a/src/lerobot/datasets/__init__.py b/src/lerobot/datasets/__init__.py index 6c42959a5..b51ef0222 100644 --- a/src/lerobot/datasets/__init__.py +++ b/src/lerobot/datasets/__init__.py @@ -40,6 +40,7 @@ from .io_utils import load_episodes, write_stats from .lerobot_dataset import LeRobotDataset from .multi_dataset import MultiLeRobotDataset from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features +from .pyav_utils import check_video_encoder_parameters_pyav, detect_available_encoders_pyav from .sampler import EpisodeAwareSampler from .streaming_dataset import StreamingLeRobotDataset from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card @@ -59,6 +60,8 @@ __all__ = [ "MultiLeRobotDataset", "StreamingLeRobotDataset", "VideoEncodingManager", + "check_video_encoder_parameters_pyav", + "detect_available_encoders_pyav", "add_features", "aggregate_datasets", "aggregate_pipeline_dataset_features", diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py index 90fc8f583..5db3f934d 100644 --- a/src/lerobot/datasets/aggregate.py +++ b/src/lerobot/datasets/aggregate.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import logging import shutil from pathlib import Path @@ -23,9 +24,11 @@ import datasets import pandas as pd import tqdm +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS + from .compute_stats import aggregate_stats from .dataset_metadata import LeRobotDatasetMetadata -from .feature_utils import get_hf_features_from_features +from .feature_utils import features_equal_for_merge, get_hf_features_from_features from .io_utils import ( get_file_size_in_mb, get_parquet_file_size_in_mb, @@ -46,11 +49,54 @@ from .utils import ( from .video_utils import concatenate_video_files, get_video_duration_in_s +def merge_video_feature_info_for_aggregate(all_metadata: list[LeRobotDatasetMetadata]) -> dict[str, dict]: + """Create a merged video feature info dictionary for aggregation. The video encoder info is merged field-by-field: each key is kept only when every source agrees; otherwise that key is set to ``null`` (or ``{}`` for ``video.extra_options``) and a warning is logged. + + Args: + all_metadata: List of LeRobotDatasetMetadata objects to merge. + + Returns: + dict: A dictionary of merged video feature info. + """ + merged_info = copy.deepcopy(all_metadata[0].features) + video_keys = [k for k in merged_info if merged_info[k].get("dtype") == "video"] + + for vk in video_keys: + video_infos = [m.features.get(vk, {}).get("info") or {} for m in all_metadata] + base_video_info = video_infos[0] + + merged_encoder_info: dict = {} + fallback_keys: list[str] = [] + for info_key in VIDEO_ENCODER_INFO_KEYS: + values = [info.get(info_key, None) for info in video_infos] + first_value = values[0] + all_match = all(v == first_value for v in values[1:]) + + if all_match: + merged_encoder_info[info_key] = first_value + else: + fallback_keys.append(info_key) + merged_encoder_info[info_key] = {} if info_key == "video.extra_options" else None + + if fallback_keys: + logging.warning( + f"Merging heterogeneous or incomplete video encoder metadata for feature {vk}. 
" + f"Setting these keys to null: {fallback_keys}.", + ) + + merged_info[vk]["info"] = {**base_video_info, **merged_encoder_info} + # TODO(CarolinePascal): make this variable once we have support for other video backends. + merged_info[vk]["info"]["video.video_backend"] = "pyav" + + return merged_info + + def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]): """Validates that all dataset metadata have consistent properties. Ensures all datasets have the same fps, robot_type, and features to guarantee compatibility when aggregating them into a single dataset. + Video encoder info is not considered for validation but is merged during aggregation in ``merge_video_feature_info_for_aggregate``. Args: all_metadata: List of LeRobotDatasetMetadata objects to validate. @@ -74,7 +120,7 @@ def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]): raise ValueError( f"Same robot_type is expected, but got robot_type={meta.robot_type} instead of {robot_type}." ) - if features != meta.features: + if not features_equal_for_merge(features, meta.features): raise ValueError( f"Same features is expected, but got features={meta.features} instead of {features}." ) @@ -274,7 +320,8 @@ def aggregate_datasets( LeRobotDatasetMetadata(repo_id, root=root) for repo_id, root in zip(repo_ids, roots, strict=False) ] ) - fps, robot_type, features = validate_all_metadata(all_metadata) + fps, robot_type, _ = validate_all_metadata(all_metadata) + features = merge_video_feature_info_for_aggregate(all_metadata) video_keys = [key for key in features if features[key]["dtype"] == "video"] dst_meta = LeRobotDatasetMetadata.create( @@ -332,7 +379,6 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu videos_idx: Dictionary tracking video chunk and file indices. 
video_files_size_in_mb: Maximum size for video files in MB (defaults to DEFAULT_VIDEO_FILE_SIZE_IN_MB) chunk_size: Maximum number of files per chunk (defaults to DEFAULT_CHUNK_SIZE) - Returns: dict: Updated videos_idx with current chunk and file indices. """ @@ -414,9 +460,11 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu current_dst_duration = dst_file_durations.get(dst_key, 0) videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_dst_duration videos_idx[key]["src_to_dst"][(src_chunk_idx, src_file_idx)] = dst_key + # TODO(CarolinePascal): Move the check before the loop to avoid failing in the middle + add possibility to re-encode the video if the check fails concatenate_video_files( [dst_path, src_path], dst_path, + compatibility_check=True, ) # Update duration of this destination file dst_file_durations[dst_key] = current_dst_duration + src_duration diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py index b404ddb18..3c58774c3 100644 --- a/src/lerobot/datasets/dataset_metadata.py +++ b/src/lerobot/datasets/dataset_metadata.py @@ -24,6 +24,7 @@ import pyarrow as pa import pyarrow.parquet as pq from huggingface_hub import snapshot_download +from lerobot.configs import VideoEncoderConfig from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE from lerobot.utils.feature_utils import _validate_feature_names from lerobot.utils.utils import flatten_dict @@ -534,10 +535,23 @@ class LeRobotDatasetMetadata: self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats write_stats(self.stats, self.root) - def update_video_info(self, video_key: str | None = None) -> None: - """ + def update_video_info( + self, + video_key: str | None = None, + camera_encoder: VideoEncoderConfig | None = None, + ) -> None: + """Populate per-feature video info in ``info.json``. 
+ Warning: this function writes info from first episode videos, implicitly assuming that all videos have been encoded the same way. Also, this means it assumes the first episode exists. + + Args: + video_key: If provided, only update this video key. Otherwise update + all video keys in the dataset. + camera_encoder: Encoder configuration used to produce the + videos. When provided, its fields are recorded as + ``video.`` entries alongside the stream-derived + ``video.*`` entries (see :func:`get_video_info`). """ if video_key is not None and video_key not in self.video_keys: raise ValueError(f"Video key {video_key} not found in dataset") @@ -546,7 +560,7 @@ class LeRobotDatasetMetadata: for key in video_keys: if not self.features[key].get("info", None): video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0) - self.info.features[key]["info"] = get_video_info(video_path) + self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder) def update_chunk_settings( self, diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 46dd9bff2..489914fbc 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -36,6 +36,7 @@ import pyarrow.parquet as pq import torch from tqdm import tqdm +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE from lerobot.utils.utils import flatten_dict @@ -62,7 +63,10 @@ from .utils import ( DEFAULT_EPISODES_PATH, update_chunk_file_indices, ) -from .video_utils import encode_video_frames, get_video_info +from .video_utils import ( + encode_video_frames, + get_video_info, +) def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict: @@ -95,6 +99,11 @@ def delete_episodes( ) -> LeRobotDataset: """Delete episodes from a LeRobotDataset and create a new dataset. 
+ Video segments that need re-encoding (because the source file mixes kept and + deleted episodes) are re-encoded with the source dataset's existing encoder + settings — read back from ``meta/info.json`` — so the output dataset stays + consistent with its own metadata. + Args: dataset: The source LeRobotDataset. episode_indices: List of episode indices to delete. @@ -157,6 +166,11 @@ def split_dataset( ) -> dict[str, LeRobotDataset]: """Split a LeRobotDataset into multiple smaller datasets. + Video segments that need re-encoding (because the source file mixes episodes + that fall into different splits) are re-encoded with the source dataset's + existing encoder settings — read back from ``meta/info.json`` — so each + output split stays consistent with its own metadata. + Args: dataset: The source LeRobotDataset to split. splits: Either a dict mapping split names to episode indices, or a dict mapping @@ -578,8 +592,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", + camera_encoder: VideoEncoderConfig, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -593,8 +606,7 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - vcodec: Video codec to use for encoding. - pix_fmt: Pixel format for output video. + camera_encoder: Video encoder settings used to re-encode the kept frames. """ from fractions import Fraction @@ -619,12 +631,13 @@ def _keep_episodes_from_video_with_av( # Convert fps to Fraction for PyAV compatibility. 
fps_fraction = Fraction(fps).limit_denominator(1000) - v_out = out.add_stream(vcodec, rate=fps_fraction) + codec_options = camera_encoder.get_codec_options(as_strings=True) + v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options) # PyAV type stubs don't distinguish video streams from audio/subtitle streams. v_out.width = v_in.codec_context.width v_out.height = v_in.codec_context.height - v_out.pix_fmt = pix_fmt + v_out.pix_fmt = camera_encoder.pix_fmt # Set time_base to match the frame rate for proper timestamp handling. v_out.time_base = Fraction(1, int(fps)) @@ -687,14 +700,14 @@ def _copy_and_reindex_videos( src_dataset: LeRobotDataset, dst_meta: LeRobotDatasetMetadata, episode_mapping: dict[int, int], - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", ) -> dict[int, dict]: """Copy and filter video files, only re-encoding files with deleted episodes. For video files that only contain kept episodes, we copy them directly. For files with mixed kept/deleted episodes, we use PyAV filters to efficiently - re-encode only the desired segments. + re-encode only the desired segments. The encoder used for re-encoding is + derived per video key from the source dataset's ``meta/info.json`` so the + destination metadata keeps describing the videos accurately. 
Args: src_dataset: Source dataset to copy from @@ -711,6 +724,9 @@ def _copy_and_reindex_videos( for video_key in src_dataset.meta.video_keys: logging.info(f"Processing videos for {video_key}") + camera_encoder = VideoEncoderConfig.from_video_info( + src_dataset.meta.info.features.get(video_key, {}).get("info") + ) if dst_meta.video_path is None: raise ValueError("Destination metadata has no video_path defined") @@ -792,8 +808,7 @@ def _copy_and_reindex_videos( dst_video_path, episodes_to_keep_ranges, src_dataset.meta.fps, - vcodec, - pix_fmt, + camera_encoder, ) cumulative_ts = 0.0 @@ -1264,11 +1279,7 @@ def _estimate_frame_size_via_calibration( episode_indices: list[int], temp_dir: Path, fps: int, - vcodec: str, - pix_fmt: str, - g: int, - crf: int, - fast_decode: int, + camera_encoder: VideoEncoderConfig, num_calibration_frames: int = 30, ) -> float: """Estimate MB per frame by encoding a small calibration sample. @@ -1282,11 +1293,7 @@ def _estimate_frame_size_via_calibration( episode_indices: List of episode indices being processed. temp_dir: Temporary directory for calibration files. fps: Frames per second for video encoding. - vcodec: Video codec (libsvtav1, h264, hevc). - pix_fmt: Pixel format (yuv420p, etc.). - g: GOP size (group of pictures). - crf: Constant Rate Factor (quality). - fast_decode: Fast decode tuning parameter. + camera_encoder: Video encoder settings used for calibration encoding. num_calibration_frames: Number of frames to use for calibration (default: 30). 
Returns: @@ -1322,11 +1329,7 @@ def _estimate_frame_size_via_calibration( imgs_dir=calibration_dir, video_path=calibration_video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, overwrite=True, ) @@ -1644,11 +1647,7 @@ def convert_image_to_video_dataset( dataset: LeRobotDataset, output_dir: Path | None = None, repo_id: str | None = None, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int = 2, - crf: int = 30, - fast_decode: int = 0, + camera_encoder: VideoEncoderConfig | None = None, episode_indices: list[int] | None = None, num_workers: int = 4, max_episodes_per_batch: int | None = None, @@ -1663,11 +1662,8 @@ def convert_image_to_video_dataset( dataset: The source LeRobot dataset with images output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - vcodec: Video codec (default: libsvtav1) - pix_fmt: Pixel format (default: yuv420p) - g: Group of pictures size (default: 2) - crf: Constant rate factor (default: 30) - fast_decode: Fast decode tuning (default: 0) + camera_encoder: Video encoder settings + (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). 
episode_indices: List of episode indices to convert (None = all episodes) num_workers: Number of threads for parallel processing (default: 4) max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) @@ -1676,6 +1672,9 @@ def convert_image_to_video_dataset( Returns: New LeRobotDataset with images encoded as videos """ + if camera_encoder is None: + camera_encoder = camera_encoder_defaults() + # Check that it's an image dataset if len(dataset.meta.video_keys) > 0: raise ValueError( @@ -1699,7 +1698,10 @@ def convert_image_to_video_dataset( logging.info( f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" ) - logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") + logging.info( + f"Video codec: {camera_encoder.vcodec}, pixel format: {camera_encoder.pix_fmt}, " + f"GOP: {camera_encoder.g}, CRF: {camera_encoder.crf}" + ) # Create new features dict, converting image features to video features new_features = {} @@ -1769,11 +1771,7 @@ def convert_image_to_video_dataset( episode_indices=episode_indices, temp_dir=temp_dir, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, ) logging.info(f"Processing camera: {img_key}") @@ -1815,11 +1813,7 @@ def convert_image_to_video_dataset( imgs_dir=imgs_dir, video_path=video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, overwrite=True, ) @@ -1865,7 +1859,9 @@ def convert_image_to_video_dataset( video_path = new_meta.root / new_meta.video_path.format( video_key=img_key, chunk_index=0, file_index=0 ) - new_meta.info.features[img_key]["info"] = get_video_info(video_path) + new_meta.info.features[img_key]["info"] = get_video_info( + video_path, camera_encoder=camera_encoder + ) write_info(new_meta.info, new_meta.root) diff --git a/src/lerobot/datasets/dataset_writer.py 
b/src/lerobot/datasets/dataset_writer.py index cf306a86a..6be63194f 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -31,6 +31,8 @@ import PIL.Image import pyarrow.parquet as pq import torch +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults + from .compute_stats import compute_episode_stats from .dataset_metadata import LeRobotDatasetMetadata from .feature_utils import ( @@ -65,14 +67,19 @@ def _encode_video_worker( episode_index: int, root: Path, fps: int, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) img_dir = (root / fpath).parent encode_video_frames( - img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads + img_dir, + temp_path, + fps, + camera_encoder=camera_encoder, + encoder_threads=encoder_threads, + overwrite=True, ) shutil.rmtree(img_dir) return temp_path @@ -89,20 +96,22 @@ class DatasetWriter: self, meta: LeRobotDatasetMetadata, root: Path, - vcodec: str, + camera_encoder: VideoEncoderConfig | None, encoder_threads: int | None, batch_encoding_size: int, streaming_encoder: StreamingVideoEncoder | None = None, initial_frames: int = 0, ): - """Initialize the writer with metadata, codec, and encoding config. + """Initialize the writer with metadata, codec, and encoder config. Args: meta: Dataset metadata instance (used for feature schema, chunk settings, and episode persistence). root: Local dataset root directory. - vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``). - encoder_threads: Threads per encoder instance. ``None`` for auto. + camera_encoder: Video encoder settings applied to all cameras. + ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`. 
+ encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder` @@ -111,7 +120,7 @@ class DatasetWriter: """ self._meta = meta self._root = root - self._vcodec = vcodec + self._camera_encoder = camera_encoder or camera_encoder_defaults() self._encoder_threads = encoder_threads self._batch_encoding_size = batch_encoding_size self._streaming_encoder = streaming_encoder @@ -284,7 +293,7 @@ class DatasetWriter: episode_index, self._root, self._meta.fps, - self._vcodec, + self._camera_encoder, self._encoder_threads, ): video_key for video_key in self._meta.video_keys @@ -495,7 +504,7 @@ class DatasetWriter: # Update video info (only needed when first episode is encoded) if episode_index == 0: - self._meta.update_video_info(video_key) + self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder) write_info(self._meta.info, self._meta.root) metadata = { @@ -564,7 +573,12 @@ class DatasetWriter: def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: """Use ffmpeg to convert frames stored as png into mp4 videos.""" return _encode_video_worker( - video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads + video_key, + episode_index, + self._root, + self._meta.fps, + self._camera_encoder, + self._encoder_threads, ) def close_writer(self) -> None: diff --git a/src/lerobot/datasets/feature_utils.py b/src/lerobot/datasets/feature_utils.py index 2ab4b0ea6..d5a550a4c 100644 --- a/src/lerobot/datasets/feature_utils.py +++ b/src/lerobot/datasets/feature_utils.py @@ -19,6 +19,7 @@ import datasets import numpy as np from PIL import Image as PILImage +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS from lerobot.utils.constants import DEFAULT_FEATURES from lerobot.utils.utils import is_valid_numpy_dtype_string @@ -108,6 +109,41 
@@ def create_empty_dataset_info( ) +def features_equal_for_merge(features_a: dict[str, dict], features_b: dict[str, dict]) -> bool: + """Return whether two LeRobotDatasetMetadata ``features`` dicts are compatible for aggregation. + + For video features, keys under ``info`` related to video encoding parameters are ignored during + comparison as they do not prevent aggregation. + """ + + def _without_encoder_info_keys(feature: dict) -> dict: + filtered = dict(feature) + filtered_info = filtered.get("info") + if isinstance(filtered_info, dict): + filtered["info"] = { + info_key: info_value + for info_key, info_value in filtered_info.items() + if info_key not in VIDEO_ENCODER_INFO_KEYS + } + return filtered + + if set(features_a) != set(features_b): + return False + for key in features_a: + fa_key = features_a[key] + fb_key = features_b[key] + if fa_key.get("dtype") != fb_key.get("dtype"): + return False + if fa_key.get("dtype") != "video": + if fa_key != fb_key: + return False + continue + + if _without_encoder_info_keys(fa_key) != _without_encoder_info_keys(fb_key): + return False + return True + + def check_delta_timestamps( delta_timestamps: dict[str, list[float]], fps: int, tolerance_s: float, raise_value_error: bool = True ) -> bool: diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index ab55aa9f8..9734bcc74 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -24,6 +24,7 @@ import torch.utils from huggingface_hub import HfApi, snapshot_download from huggingface_hub.errors import RevisionNotFoundError +from lerobot.configs import VideoEncoderConfig from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata @@ -36,8 +37,7 @@ from .utils import ( ) from .video_utils import ( StreamingVideoEncoder, - get_safe_default_codec, - resolve_vcodec, + get_safe_default_video_backend, ) logger = 
logging.getLogger(__name__) @@ -59,10 +59,10 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: str | None = None, return_uint8: bool = False, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ): """ 2 modes are available for instantiating this class, depending on 2 different use cases: @@ -183,16 +183,15 @@ class LeRobotDataset(torch.utils.data.Dataset): You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. - vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc', - 'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'. - Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder. + camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras + (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` + is used by the writer. + encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the + codec decide. streaming_encoding (bool, optional): If True, encode video frames in real-time during capture instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False. encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using streaming encoding. Defaults to 30 (~1s at 30fps). - encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the - codec auto-detect (default). 
Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for - libsvtav1 and 'threads' for h264/hevc. Note: Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to @@ -207,10 +206,9 @@ class LeRobotDataset(torch.utils.data.Dataset): self.delta_timestamps = delta_timestamps self.tolerance_s = tolerance_s self.revision = revision if revision else CODEBASE_VERSION - self._video_backend = video_backend if video_backend else get_safe_default_codec() + self._video_backend = video_backend if video_backend else get_safe_default_video_backend() self._return_uint8 = return_uint8 self._batch_encoding_size = batch_encoding_size - self._vcodec = resolve_vcodec(vcodec) self._encoder_threads = encoder_threads if self._requested_root is not None: @@ -273,12 +271,15 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(self.meta.video_keys) > 0: streaming_enc = self._build_streaming_encoder( - self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads + self.meta.fps, + camera_encoder, + encoder_queue_maxsize, + encoder_threads, ) self.writer = DatasetWriter( meta=self.meta, root=self.root, - vcodec=self._vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -320,17 +321,13 @@ class LeRobotDataset(torch.utils.data.Dataset): @staticmethod def _build_streaming_encoder( fps: int, - vcodec: str, + camera_encoder: VideoEncoderConfig | None, encoder_queue_maxsize: int, encoder_threads: int | None, ) -> StreamingVideoEncoder: return StreamingVideoEncoder( fps=fps, - vcodec=vcodec, - pix_fmt="yuv420p", - g=2, - crf=30, - preset=None, + camera_encoder=camera_encoder, queue_maxsize=encoder_queue_maxsize, encoder_threads=encoder_threads, ) @@ -647,7 +644,7 @@ class LeRobotDataset(torch.utils.data.Dataset): image_writer_threads: int = 0, video_backend: str | None = None, batch_encoding_size: int = 
1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, metadata_buffer_size: int = 10, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -678,20 +675,20 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend (used when reading back). batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. ``1`` means encode immediately. - vcodec: Video codec for encoding. Options include ``'libsvtav1'``, - ``'h264'``, ``'hevc'``, ``'auto'``. + camera_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. metadata_buffer_size: Number of episode metadata records to buffer before flushing to parquet. streaming_encoding: If ``True``, encode video frames in real-time during capture instead of writing images first. encoder_queue_maxsize: Max buffered frames per camera when using streaming encoding. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A new :class:`LeRobotDataset` in write mode. 
""" - vcodec = resolve_vcodec(vcodec) obj = cls.__new__(cls) obj.meta = LeRobotDatasetMetadata.create( repo_id=repo_id, @@ -712,23 +709,23 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.image_transforms = None obj.delta_timestamps = None obj.episodes = None - obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec() + obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec obj._encoder_threads = encoder_threads # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: - streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads) + streaming_enc = cls._build_streaming_encoder( + fps, camera_encoder, encoder_queue_maxsize, encoder_threads + ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -751,12 +748,12 @@ class LeRobotDataset(torch.utils.data.Dataset): force_cache_sync: bool = False, video_backend: str | None = None, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, image_writer_processes: int = 0, image_writer_threads: int = 0, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ) -> "LeRobotDataset": """Resume recording on an existing dataset. @@ -779,13 +776,15 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend for reading back data. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. - vcodec: Video codec for encoding. 
+ camera_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. image_writer_processes: Subprocesses for async image writing. image_writer_threads: Threads for async image writing. streaming_encoding: If ``True``, encode video in real-time during capture. encoder_queue_maxsize: Max buffered frames per camera for streaming. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A :class:`LeRobotDataset` in write mode, ready to append episodes. @@ -796,7 +795,6 @@ class LeRobotDataset(torch.utils.data.Dataset): "Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt " "the shared cache. Please provide a local directory path." ) - vcodec = resolve_vcodec(vcodec) obj = cls.__new__(cls) obj.repo_id = repo_id obj._requested_root = Path(root) @@ -805,11 +803,9 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.image_transforms = None obj.delta_timestamps = None obj.episodes = None - obj._video_backend = video_backend if video_backend else get_safe_default_codec() + obj._video_backend = video_backend if video_backend else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec - obj._encoder_threads = encoder_threads if obj._requested_root is not None: obj._requested_root.mkdir(exist_ok=True, parents=True) @@ -818,21 +814,22 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.meta = LeRobotDatasetMetadata( obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync ) + + obj._encoder_threads = encoder_threads obj.root = obj.meta.root # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer for appending streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = 
cls._build_streaming_encoder( - obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads + obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py new file mode 100644 index 000000000..d291f8b40 --- /dev/null +++ b/src/lerobot/datasets/pyav_utils.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyAV-based compatibility checks for :class:`VideoEncoderConfig`. + +Centralises all :mod:`av` introspection of the bundled FFmpeg build. +Checks degrade to a no-op when the target codec isn't available locally. 
+""" + +import functools +import logging +from typing import Any + +import av + +logger = logging.getLogger(__name__) + +FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE") +FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64") + + +@functools.cache +def get_codec(vcodec: str) -> av.codec.Codec | None: + """PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if unavailable.""" + try: + return av.codec.Codec(vcodec, "w") + except Exception: + return None + + +@functools.cache +def _get_codec_options_by_name(vcodec: str) -> dict[str, av.option.Option]: + """Private-option name → PyAV ``Option`` for *vcodec* (empty if unavailable).""" + codec = get_codec(vcodec) + if codec is None: + return {} + return {opt.name: opt for opt in codec.descriptor.options} + + +@functools.cache +def _get_codec_video_formats(vcodec: str) -> tuple[str, ...]: + """Pixel formats accepted by *vcodec* in PyAV's preferred order (empty if unknown).""" + codec = get_codec(vcodec) + if codec is None: + return () + return tuple(fmt.name for fmt in (codec.video_formats or [])) + + +def detect_available_encoders_pyav(encoders: list[str] | str) -> list[str]: + """Return the subset of *encoders* available as video encoders in the local FFmpeg build. + + Each name is probed directly via :func:`get_codec`; input order is preserved. 
+ """ + if isinstance(encoders, str): + encoders = [encoders] + + available: list[str] = [] + for name in encoders: + codec = get_codec(name) + if codec is not None and codec.type == "video": + available.append(name) + else: + logger.debug("encoder '%s' not available as video encoder", name) + return available + + +def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Option) -> None: + """Range-check numeric *value* and choice-check string *value* against *opt*.""" + type_name = opt.type.name + if type_name in FFMPEG_NUMERIC_OPTION_TYPES: + if isinstance(value, bool): + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + elif isinstance(value, str): + try: + num_val = float(value) + except ValueError as e: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) from e + elif isinstance(value, (float, int)): + num_val = value + else: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + + # Check integer type compatibility + if type_name in FFMPEG_INTEGER_OPTION_TYPES and not num_val.is_integer(): + raise ValueError( + f"{label}={num_val!r} must be an integer for codec {vcodec!r} " + f"(FFmpeg option {opt.name!r} is {type_name}); float values are not allowed." 
+ ) + + # Check numeric range compatibility + lo, hi = float(opt.min), float(opt.max) + if lo < hi and not (lo <= num_val <= hi): + raise ValueError( + f"{label}={num_val} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]" + ) + + elif type_name == "STRING": + if isinstance(value, bool): + raise ValueError(f"{label}={value!r} is not a valid string value for codec {vcodec!r}.") + if isinstance(value, str): + str_val = value + elif isinstance(value, (int, float)): + str_val = str(value) + else: + raise ValueError(f"{label}={value!r} has unsupported type for STRING option on codec {vcodec!r}") + + # Check string choice compatibility + choices = [c.name for c in (opt.choices or [])] + if choices and str_val not in choices: + raise ValueError( + f"{label}={str_val!r} is not a supported choice for codec " + f"{vcodec!r}; valid choices: {choices}" + ) + else: + return + + +def _check_pixel_format(vcodec: str, pix_fmt: str) -> None: + formats = _get_codec_video_formats(vcodec) + if formats and pix_fmt not in formats: + raise ValueError( + f"pix_fmt={pix_fmt!r} is not supported by codec {vcodec!r}; " + f"supported pixel formats: {list(formats)}" + ) + + +def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None: + """Validate merged encoder options (typed) against the codec's published AVOptions.""" + supported_options = _get_codec_options_by_name(vcodec) + for key, value in codec_options.items(): + # GOP size is not a codec-specific option, it has to be validated separately. + if key == "g": + if isinstance(value, bool) or not isinstance(value, int) or value < 1: + raise ValueError(f"g={value!r} must be a positive integer for codec {vcodec!r}") + continue + if key not in supported_options: + continue + _check_option_value(vcodec, key, value, supported_options[key]) + + +def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options: dict[str, Any]) -> None: + """Verify *config* is compatible with the bundled FFmpeg build. 
+ + Checks pixel format, abstract tuning-field compatibility, and each merged + encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options` + against PyAV (including numeric ``extra_options`` present in that dict). + No-op when ``config.vcodec`` isn't in the local FFmpeg build. + + Raises: + ValueError: on the first incompatibility encountered. + """ + options = _get_codec_options_by_name(vcodec) + if not options: + raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build") + _check_pixel_format(vcodec, pix_fmt) + _check_codec_options(vcodec, codec_options) diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 00ff09ee7..e823a406c 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -22,7 +22,7 @@ import shutil import tempfile import threading import warnings -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from fractions import Fraction from pathlib import Path from threading import Lock @@ -36,86 +36,14 @@ import torch from datasets.features.features import register_feature from PIL import Image -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.configs import ( + VideoEncoderConfig, + camera_encoder_defaults, +) +from lerobot.utils.import_utils import get_safe_default_video_backend logger = logging.getLogger(__name__) -# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build. -# Determines the order of preference for auto-selection when vcodec="auto" is used. 
-HW_ENCODERS = [ - "h264_videotoolbox", # macOS - "hevc_videotoolbox", # macOS - "h264_nvenc", # NVIDIA GPU - "hevc_nvenc", # NVIDIA GPU - "h264_vaapi", # Linux Intel/AMD - "h264_qsv", # Intel Quick Sync -] - -VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS) - - -def _get_codec_options( - vcodec: str, - g: int | None = 2, - crf: int | None = 30, - preset: int | None = None, -) -> dict: - """Build codec-specific options dict for video encoding.""" - options = {} - - # GOP size (keyframe interval) - supported by VideoToolbox and software encoders - if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS): - options["g"] = str(g) - - # Quality control (codec-specific parameter names) - if crf is not None: - if vcodec in ("h264", "hevc", "libsvtav1"): - options["crf"] = str(crf) - elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): - quality = max(1, min(100, int(100 - crf * 2))) - options["q:v"] = str(quality) - elif vcodec in ("h264_nvenc", "hevc_nvenc"): - options["rc"] = "constqp" - options["qp"] = str(crf) - elif vcodec in ("h264_vaapi",): - options["qp"] = str(crf) - elif vcodec in ("h264_qsv",): - options["global_quality"] = str(crf) - - # Preset (only for libsvtav1) - if vcodec == "libsvtav1": - options["preset"] = str(preset) if preset is not None else "12" - - return options - - -def detect_available_hw_encoders() -> list[str]: - """Probe PyAV/FFmpeg for available hardware video encoders.""" - available = [] - for codec_name in HW_ENCODERS: - try: - av.codec.Codec(codec_name, "w") - available.append(codec_name) - except Exception: # nosec B110 - logger.debug("HW encoder '%s' not available", codec_name) # nosec B110 - return available - - -def resolve_vcodec(vcodec: str) -> str: - """Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1.""" - if vcodec not in VALID_VIDEO_CODECS: - raise ValueError(f"Invalid vcodec '{vcodec}'. 
Must be one of: {sorted(VALID_VIDEO_CODECS)}") - if vcodec != "auto": - logger.info(f"Using video codec: {vcodec}") - return vcodec - available = detect_available_hw_encoders() - for encoder in HW_ENCODERS: - if encoder in available: - logger.info(f"Auto-selected video codec: {encoder}") - return encoder - logger.info("No hardware encoder available, falling back to software encoder 'libsvtav1'") - return "libsvtav1" - def decode_video_frames( video_path: Path | str, @@ -143,7 +71,7 @@ def decode_video_frames( Currently supports torchcodec on cpu and pyav. """ if backend is None: - backend = get_safe_default_codec() + backend = get_safe_default_video_backend() if backend == "torchcodec": return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8) elif backend == "pyav": @@ -407,18 +335,17 @@ def encode_video_frames( imgs_dir: Path | str, video_path: Path | str, fps: int, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int | None = 2, - crf: int | None = 30, - fast_decode: int = 0, + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, + *, log_level: int | None = av.logging.WARNING, overwrite: bool = False, - preset: int | None = None, - encoder_threads: int | None = None, ) -> None: """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" - vcodec = resolve_vcodec(vcodec) + if camera_encoder is None: + camera_encoder = camera_encoder_defaults() + vcodec = camera_encoder.vcodec + pix_fmt = camera_encoder.pix_fmt video_path = Path(video_path) imgs_dir = Path(imgs_dir) @@ -429,42 +356,18 @@ def encode_video_frames( video_path.parent.mkdir(parents=True, exist_ok=True) - # Encoders/pixel formats incompatibility check - if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p": - logger.warning( - f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'" - ) - pix_fmt = "yuv420p" - # Get input frames template = 
"frame-" + ("[0-9]" * 6) + ".png" input_list = sorted( glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0]) ) - # Define video output frame size (assuming all input frames are the same size) if len(input_list) == 0: raise FileNotFoundError(f"No images found in {imgs_dir}.") with Image.open(input_list[0]) as dummy_image: width, height = dummy_image.size - # Define video codec options - video_options = _get_codec_options(vcodec, g, crf, preset) - - if fast_decode: - key = "svtav1-params" if vcodec == "libsvtav1" else "tune" - value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode" - video_options[key] = value - - if encoder_threads is not None: - if vcodec == "libsvtav1": - lp_param = f"lp={encoder_threads}" - if "svtav1-params" in video_options: - video_options["svtav1-params"] += f":{lp_param}" - else: - video_options["svtav1-params"] = lp_param - else: - video_options["threads"] = str(encoder_threads) + video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) # Set logging level if log_level is not None: @@ -501,7 +404,10 @@ def encode_video_frames( def concatenate_video_files( - input_video_paths: list[Path | str], output_video_path: Path, overwrite: bool = True + input_video_paths: list[Path | str], + output_video_path: Path, + overwrite: bool = True, + compatibility_check: bool = False, ): """ Concatenate multiple video files into a single video file using pyav. @@ -514,6 +420,7 @@ def concatenate_video_files( input_video_paths: Ordered list of input video file paths to concatenate. output_video_path: Path to the output video file. overwrite: Whether to overwrite the output video file if it already exists. Default is True. + compatibility_check: Whether to check if the input videos are compatible. Default is False. Note: - Creates a temporary directory for intermediate files that is cleaned up after use. 
@@ -532,6 +439,22 @@ def concatenate_video_files( if len(input_video_paths) == 0: raise FileNotFoundError("No input video paths provided.") + # This check may be skipped at recording time as videos are encoded with the same encoder config. + if compatibility_check: + reference_video_info = get_video_info(input_video_paths[0]) + for input_path in input_video_paths[1:]: + video_info = get_video_info(input_path) + if ( + video_info["video.height"] != reference_video_info["video.height"] + or video_info["video.width"] != reference_video_info["video.width"] + or video_info["video.fps"] != reference_video_info["video.fps"] + or video_info["video.codec"] != reference_video_info["video.codec"] + or video_info["video.pix_fmt"] != reference_video_info["video.pix_fmt"] + ): + raise ValueError( + f"Input video {input_path} is not compatible with the reference video {input_video_paths[0]}." + ) + # Create a temporary .ffconcat file to list the input video paths with tempfile.NamedTemporaryFile(mode="w", suffix=".ffconcat", delete=False) as tmp_concatenate_file: tmp_concatenate_file.write("ffconcat version 1.0\n") @@ -598,26 +521,20 @@ class _CameraEncoderThread(threading.Thread): fps: int, vcodec: str, pix_fmt: str, - g: int | None, - crf: int | None, - preset: int | None, + codec_options: dict[str, str], frame_queue: queue.Queue, result_queue: queue.Queue, stop_event: threading.Event, - encoder_threads: int | None = None, ): super().__init__(daemon=True) self.video_path = video_path self.fps = fps self.vcodec = vcodec self.pix_fmt = pix_fmt - self.g = g - self.crf = crf - self.preset = preset + self.codec_options = codec_options self.frame_queue = frame_queue self.result_queue = result_queue self.stop_event = stop_event - self.encoder_threads = encoder_threads def run(self) -> None: from .compute_stats import RunningQuantileStats, auto_downsample_height_width @@ -653,19 +570,9 @@ class _CameraEncoderThread(threading.Thread): # Open container on first frame (to get 
width/height) if container is None: height, width = frame_data.shape[:2] - video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset) - if self.encoder_threads is not None: - if self.vcodec == "libsvtav1": - lp_param = f"lp={self.encoder_threads}" - if "svtav1-params" in video_options: - video_options["svtav1-params"] += f":{lp_param}" - else: - video_options["svtav1-params"] = lp_param - else: - video_options["threads"] = str(self.encoder_threads) Path(self.video_path).parent.mkdir(parents=True, exist_ok=True) container = av.open(str(self.video_path), "w") - output_stream = container.add_stream(self.vcodec, self.fps, options=video_options) + output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options) output_stream.pix_fmt = self.pix_fmt output_stream.width = width output_stream.height = height @@ -731,22 +638,24 @@ class StreamingVideoEncoder: def __init__( self, fps: int, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int | None = 2, - crf: int | None = 30, - preset: int | None = None, + camera_encoder: VideoEncoderConfig | None = None, queue_maxsize: int = 30, encoder_threads: int | None = None, ): + """ + Args: + fps: Frames per second for the output videos. + camera_encoder: Video encoder settings applied to all cameras. + When ``None``, :func:`camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global setting). + ``None`` lets the codec decide. + queue_maxsize: Max frames to buffer per camera before + back-pressure drops frames. 
+ """ self.fps = fps - self.vcodec = resolve_vcodec(vcodec) - self.pix_fmt = pix_fmt - self.g = g - self.crf = crf - self.preset = preset + self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._encoder_threads = encoder_threads self.queue_maxsize = queue_maxsize - self.encoder_threads = encoder_threads self._frame_queues: dict[str, queue.Queue] = {} self._result_queues: dict[str, queue.Queue] = {} @@ -777,18 +686,17 @@ class StreamingVideoEncoder: temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir)) video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" + vcodec = self._camera_encoder.vcodec + codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=self.fps, - vcodec=self.vcodec, - pix_fmt=self.pix_fmt, - g=self.g, - crf=self.crf, - preset=self.preset, + vcodec=vcodec, + pix_fmt=self._camera_encoder.pix_fmt, + codec_options=codec_options, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, - encoder_threads=self.encoder_threads, ) encoder_thread.start() @@ -993,8 +901,18 @@ def get_audio_info(video_path: Path | str) -> dict: return audio_info -def get_video_info(video_path: Path | str) -> dict: - # Set logging level +def get_video_info( + video_path: Path | str, + camera_encoder: VideoEncoderConfig | None = None, +) -> dict: + """Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``. + + Args: + video_path: Path to the encoded video file to probe. + camera_encoder: If provided, record the exact encoder settings used to encode this + video. Stream-derived values take precedence — encoder fields are only written for keys + not already populated from the video file itself. 
+ """ logging.getLogger("libav").setLevel(av.logging.WARNING) # Getting video stream information @@ -1025,6 +943,14 @@ def get_video_info(video_path: Path | str) -> dict: # Adding audio stream information video_info.update(**get_audio_info(video_path)) + # Add additional encoder configuration if provided + if camera_encoder is not None: + for field_name, field_value in asdict(camera_encoder).items(): + # vcodec is already populated from the video stream + if field_name == "vcodec": + continue + video_info.setdefault(f"video.{field_name}", field_value) + return video_info diff --git a/src/lerobot/policies/eo1/modeling_eo1.py b/src/lerobot/policies/eo1/modeling_eo1.py index 27d609ec1..1c5860de5 100644 --- a/src/lerobot/policies/eo1/modeling_eo1.py +++ b/src/lerobot/policies/eo1/modeling_eo1.py @@ -28,11 +28,12 @@ import torch.nn.functional as F # noqa: N812 import torch.utils.checkpoint from torch import Tensor -from lerobot.policies.eo1.configuration_eo1 import EO1Config -from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.utils.constants import ACTION, OBS_STATE from lerobot.utils.import_utils import _transformers_available, require_package +from ..pretrained import PreTrainedPolicy +from .configuration_eo1 import EO1Config + if TYPE_CHECKING or _transformers_available: from transformers.activations import ACT2FN from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration diff --git a/src/lerobot/policies/eo1/processor_eo1.py b/src/lerobot/policies/eo1/processor_eo1.py index 2d7bb48ae..b1f32756a 100644 --- a/src/lerobot/policies/eo1/processor_eo1.py +++ b/src/lerobot/policies/eo1/processor_eo1.py @@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Any import torch from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature -from lerobot.policies.eo1.configuration_eo1 import EO1Config from lerobot.processor import ( AddBatchDimensionProcessorStep, ComplementaryDataProcessorStep, @@ -44,6 +43,8 @@ from 
lerobot.utils.constants import ( ) from lerobot.utils.import_utils import _transformers_available, require_package +from .configuration_eo1 import EO1Config + if TYPE_CHECKING or _transformers_available: from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor else: diff --git a/src/lerobot/rewards/classifier/modeling_classifier.py b/src/lerobot/rewards/classifier/modeling_classifier.py index 1d8057135..ca02b532f 100644 --- a/src/lerobot/rewards/classifier/modeling_classifier.py +++ b/src/lerobot/rewards/classifier/modeling_classifier.py @@ -17,10 +17,11 @@ import logging import torch from torch import Tensor, nn -from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig -from lerobot.rewards.pretrained import PreTrainedRewardModel from lerobot.utils.constants import OBS_IMAGE, REWARD +from ..pretrained import PreTrainedRewardModel +from .configuration_classifier import RewardClassifierConfig + class ClassifierOutput: """Wrapper for classifier outputs with additional metadata.""" diff --git a/src/lerobot/rewards/classifier/processor_classifier.py b/src/lerobot/rewards/classifier/processor_classifier.py index 056d7e91b..a5f609d0c 100644 --- a/src/lerobot/rewards/classifier/processor_classifier.py +++ b/src/lerobot/rewards/classifier/processor_classifier.py @@ -25,7 +25,8 @@ from lerobot.processor import ( policy_action_to_transition, transition_to_policy_action, ) -from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig + +from .configuration_classifier import RewardClassifierConfig def make_classifier_processor( diff --git a/src/lerobot/rewards/factory.py b/src/lerobot/rewards/factory.py index f6716f3fb..c173f44a5 100644 --- a/src/lerobot/rewards/factory.py +++ b/src/lerobot/rewards/factory.py @@ -22,9 +22,10 @@ import torch from lerobot.configs.rewards import RewardModelConfig from lerobot.processor import PolicyAction, PolicyProcessorPipeline -from 
lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig -from lerobot.rewards.pretrained import PreTrainedRewardModel -from lerobot.rewards.sarm.configuration_sarm import SARMConfig + +from .classifier.configuration_classifier import RewardClassifierConfig +from .pretrained import PreTrainedRewardModel +from .sarm.configuration_sarm import SARMConfig def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]: diff --git a/src/lerobot/rewards/sarm/compute_rabc_weights.py b/src/lerobot/rewards/sarm/compute_rabc_weights.py index b1bf2e1f5..bdbb0d297 100644 --- a/src/lerobot/rewards/sarm/compute_rabc_weights.py +++ b/src/lerobot/rewards/sarm/compute_rabc_weights.py @@ -58,9 +58,10 @@ import torch from tqdm import tqdm from lerobot.datasets import LeRobotDataset -from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel -from lerobot.rewards.sarm.processor_sarm import make_sarm_pre_post_processors -from lerobot.rewards.sarm.sarm_utils import normalize_stage_tau + +from .modeling_sarm import SARMRewardModel +from .processor_sarm import make_sarm_pre_post_processors +from .sarm_utils import normalize_stage_tau def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None: diff --git a/src/lerobot/rewards/sarm/modeling_sarm.py b/src/lerobot/rewards/sarm/modeling_sarm.py index 365f519b2..5ebd42d30 100644 --- a/src/lerobot/rewards/sarm/modeling_sarm.py +++ b/src/lerobot/rewards/sarm/modeling_sarm.py @@ -32,13 +32,14 @@ import torch.nn as nn import torch.nn.functional as F # noqa: N812 from torch import Tensor -from lerobot.rewards.pretrained import PreTrainedRewardModel -from lerobot.rewards.sarm.configuration_sarm import SARMConfig -from lerobot.rewards.sarm.sarm_utils import ( +from lerobot.utils.constants import OBS_STR + +from ..pretrained import PreTrainedRewardModel +from .configuration_sarm import SARMConfig +from .sarm_utils import ( normalize_stage_tau, pad_state_to_max_dim, ) -from lerobot.utils.constants import 
OBS_STR class StageTransformer(nn.Module): diff --git a/src/lerobot/rewards/sarm/processor_sarm.py b/src/lerobot/rewards/sarm/processor_sarm.py index eaa5f66f5..37db374d4 100644 --- a/src/lerobot/rewards/sarm/processor_sarm.py +++ b/src/lerobot/rewards/sarm/processor_sarm.py @@ -58,15 +58,16 @@ from lerobot.processor import ( policy_action_to_transition, transition_to_policy_action, ) -from lerobot.rewards.sarm.configuration_sarm import SARMConfig -from lerobot.rewards.sarm.sarm_utils import ( +from lerobot.types import EnvTransition, PolicyAction, TransitionKey +from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME + +from .configuration_sarm import SARMConfig +from .sarm_utils import ( apply_rewind_augmentation, compute_absolute_indices, find_stage_and_tau, pad_state_to_max_dim, ) -from lerobot.types import EnvTransition, PolicyAction, TransitionKey -from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME class SARMEncodingProcessorStep(ProcessorStep): diff --git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py index 8804cd789..bf5fa0fd4 100644 --- a/src/lerobot/rollout/context.py +++ b/src/lerobot/rollout/context.py @@ -332,7 +332,7 @@ def build_rollout_context( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, @@ -367,7 +367,7 @@ def build_rollout_context( image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras if hasattr(robot, "cameras") else []), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, streaming_encoding=cfg.dataset.streaming_encoding, 
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index a708d37a3..eb6a57870 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -187,12 +187,12 @@ import abc import logging import shutil import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path import draccus -from lerobot.configs import parser +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser from lerobot.datasets import ( LeRobotDataset, convert_image_to_video_dataset, @@ -250,11 +250,7 @@ class ModifyTasksConfig(OperationConfig): @dataclass class ConvertImageToVideoConfig(OperationConfig): output_dir: str | None = None - vcodec: str = "libsvtav1" - pix_fmt: str = "yuv420p" - g: int = 2 - crf: int = 30 - fast_decode: int = 0 + camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) episode_indices: list[int] | None = None num_workers: int = 4 max_episodes_per_batch: int | None = None @@ -557,11 +553,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset=dataset, output_dir=output_dir, repo_id=output_repo_id, - vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"), - pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"), - g=getattr(cfg.operation, "g", 2), - crf=getattr(cfg.operation, "crf", 30), - fast_decode=getattr(cfg.operation, "fast_decode", 0), + camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(), episode_indices=getattr(cfg.operation, "episode_indices", None), num_workers=getattr(cfg.operation, "num_workers", 4), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 129696bd3..c8419cb14 100644 --- 
a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -63,6 +63,27 @@ lerobot-record \\ --dataset.streaming_encoding=true \\ --dataset.encoder_threads=2 ``` + +Example recording with custom video encoding parameters: +```shell +lerobot-record \\ + --robot.type=so100_follower \\ + --robot.port=/dev/tty.usbmodem58760431541 \\ + --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\ + --robot.id=black \\ + --teleop.type=so100_leader \\ + --teleop.port=/dev/tty.usbmodem58760431551 \\ + --teleop.id=blue \\ + --dataset.repo_id=/ \\ + --dataset.num_episodes=2 \\ + --dataset.single_task="Grab the cube" \\ + --dataset.streaming_encoding=true \\ + --dataset.encoder_threads=2 \\ + --dataset.camera_encoder.vcodec=h264 \\ + --dataset.camera_encoder.preset=fast \\ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \\ + --display_data=true +``` """ import logging @@ -377,10 +398,10 @@ def record( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras if num_cameras > 0 @@ -406,10 +427,10 @@ def record( image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, 
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, ) robot.connect() @@ -420,7 +441,7 @@ def record( if not cfg.dataset.streaming_encoding: logging.info( - "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" + "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" ) with VideoEncodingManager(dataset): diff --git a/src/lerobot/scripts/lerobot_rollout.py b/src/lerobot/scripts/lerobot_rollout.py index 6a81563ee..7015e707c 100644 --- a/src/lerobot/scripts/lerobot_rollout.py +++ b/src/lerobot/scripts/lerobot_rollout.py @@ -120,6 +120,18 @@ Usage examples --dataset.repo_id=user/rollout_sentry_data \\ --dataset.single_task="patrol" \\ --resume=true + + # Rollout with custom video encoding parameters + lerobot-rollout \\ + --strategy.type=base \\ + --policy.path=lerobot/act_koch_real \\ + --robot.type=koch_follower \\ + --robot.port=/dev/ttyACM0 \\ + --task="pick up cube" --duration=60 \\ + --display_data=true \\ + --dataset.camera_encoder.vcodec=h264 \\ + --dataset.camera_encoder.preset=fast \\ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} """ import logging diff --git a/src/lerobot/transport/utils.py b/src/lerobot/transport/utils.py index 8da338044..2ef63c2cc 100644 --- a/src/lerobot/transport/utils.py +++ b/src/lerobot/transport/utils.py @@ -25,9 +25,10 @@ from typing import Any import torch -from lerobot.transport import services_pb2 from lerobot.utils.transition import Transition 
+from . import services_pb2 + # FIX for protobuf: Assign the enum to a variable and ignore the type error once TransferState = services_pb2.TransferState # type: ignore[attr-defined] diff --git a/src/lerobot/utils/import_utils.py b/src/lerobot/utils/import_utils.py index 6ba912bf5..ef03367eb 100644 --- a/src/lerobot/utils/import_utils.py +++ b/src/lerobot/utils/import_utils.py @@ -69,7 +69,7 @@ def is_package_available( return package_exists -def get_safe_default_codec(): +def get_safe_default_video_backend(): logger = logging.getLogger(__name__) if importlib.util.find_spec("torchcodec"): return "torchcodec" @@ -128,6 +128,9 @@ _hidapi_available = is_package_available("hidapi", import_name="hid") _pandas_available = is_package_available("pandas") _faker_available = is_package_available("faker") +# Video encoding / decoding +_av_available = is_package_available("av") + # Misc _pynput_available = is_package_available("pynput") _pygame_available = is_package_available("pygame") diff --git a/tests/artifacts/encoded_videos/clip_32x48.mp4 b/tests/artifacts/encoded_videos/clip_32x48.mp4 new file mode 100644 index 000000000..086c399d3 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_32x48.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2191cd86e9e32ecbe18e33ad68d49060e479723ab5a3212bbb26df3025ccb568 +size 5815 diff --git a/tests/artifacts/encoded_videos/clip_4frames.mp4 b/tests/artifacts/encoded_videos/clip_4frames.mp4 new file mode 100644 index 000000000..487c3c8ad --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_4frames.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e0ebf563ba3ed9c24b691a0f0b29e0294a1fa9b51422e1ece296155f1465768 +size 16236 diff --git a/tests/artifacts/encoded_videos/clip_5frames.mp4 b/tests/artifacts/encoded_videos/clip_5frames.mp4 new file mode 100644 index 000000000..cbbe81c39 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_5frames.mp4 @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8475bfd5e6c4c780df46200e2b027e262b38436c57d01078bd943a5b87c65b8f +size 20726 diff --git a/tests/artifacts/encoded_videos/clip_6frames.mp4 b/tests/artifacts/encoded_videos/clip_6frames.mp4 new file mode 100644 index 000000000..50d9badca --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_6frames.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6434322d1c671a7d132367619f841a775317cb9ff973f3f4505831e3ed74076d +size 23808 diff --git a/tests/artifacts/encoded_videos/clip_h264.mp4 b/tests/artifacts/encoded_videos/clip_h264.mp4 new file mode 100644 index 000000000..90698dcf5 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_h264.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efc84375e92a3499cef93100e04d8fb354670f3d9e0db2097b52575927284fc +size 12237 diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py index 6d646d4f7..80a95aa1f 100644 --- a/tests/datasets/test_aggregate.py +++ b/tests/datasets/test_aggregate.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json +import logging from unittest.mock import patch import pytest @@ -23,7 +25,9 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da import datasets # noqa: E402 import torch +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS from lerobot.datasets.aggregate import aggregate_datasets +from lerobot.datasets.feature_utils import features_equal_for_merge from lerobot.datasets.lerobot_dataset import LeRobotDataset from tests.fixtures.constants import DUMMY_REPO_ID @@ -117,8 +121,9 @@ def assert_metadata_consistency(aggr_ds, ds_0, ds_1): "Robot type should be the same" ) - # Test features are the same - assert aggr_ds.features == ds_0.features == ds_1.features, "Features should be the same" + # Schema matches; merged video ``info`` is reconciled separately from per-source ``info``. + assert features_equal_for_merge(aggr_ds.features, ds_0.features) + assert features_equal_for_merge(aggr_ds.features, ds_1.features) # Test tasks aggregation expected_tasks = set(ds_0.meta.tasks.index) | set(ds_1.meta.tasks.index) @@ -284,6 +289,73 @@ def test_aggregate_datasets(tmp_path, lerobot_dataset_factory): assert_dataset_iteration_works(aggr_ds) +@pytest.mark.parametrize("mutation", ["mismatched_value", "missing_key"]) +def test_aggregate_incomplete_video_encoder_info_warns_and_nuls_encoders( + tmp_path, lerobot_dataset_factory, caplog, mutation +): + """Mismatched or missing encoder ``info`` is merged per-key with fallbacks and a warning.""" + suffix = "enc_mismatch" if mutation == "mismatched_value" else "enc_missing" + ds_0 = lerobot_dataset_factory( + root=tmp_path / f"{suffix}_a", + repo_id=f"{DUMMY_REPO_ID}_{suffix}_a", + total_episodes=2, + total_frames=20, + ) + ds_1 = lerobot_dataset_factory( + root=tmp_path / f"{suffix}_b", + repo_id=f"{DUMMY_REPO_ID}_{suffix}_b", + total_episodes=2, + total_frames=20, + ) + + info_path = ds_1.root / "meta" / "info.json" + data = json.loads(info_path.read_text()) + for ft in 
data["features"].values(): + if ft.get("dtype") != "video": + continue + inf = ft.setdefault("info", {}) + if mutation == "mismatched_value": + inf["video.crf"] = 99 + inf["video.extra_options"] = {"tune": "film"} + else: + inf.pop("video.crf", None) + inf.pop("video.extra_options", None) + info_path.write_text(json.dumps(data)) + + aggr_id = f"{DUMMY_REPO_ID}_{suffix}_aggr" + aggr_root = tmp_path / f"{suffix}_aggr" + with caplog.at_level(logging.WARNING): + aggregate_datasets( + repo_ids=[ds_0.repo_id, ds_1.repo_id], + roots=[ds_0.root, ds_1.root], + aggr_repo_id=aggr_id, + aggr_root=aggr_root, + ) + + assert "heterogeneous" in caplog.text.lower() or "incomplete" in caplog.text.lower() + + with ( + patch("lerobot.datasets.dataset_metadata.get_safe_version") as mock_get_safe_version, + patch("lerobot.datasets.dataset_metadata.snapshot_download") as mock_snapshot_download, + ): + mock_get_safe_version.return_value = "v3.0" + mock_snapshot_download.return_value = str(aggr_root) + aggr_ds = LeRobotDataset(aggr_id, root=aggr_root) + + for key, ft in aggr_ds.meta.info.features.items(): + if ft.get("dtype") != "video": + continue + info = ft["info"] + reference = ds_0.meta.info.features[key]["info"] + for info_key in VIDEO_ENCODER_INFO_KEYS: + if info_key == "video.crf": + assert info[info_key] is None + elif info_key == "video.extra_options": + assert info[info_key] == {} + else: + assert info[info_key] == reference[info_key] + + def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory): """Test aggregation with small file size limits to force file rotation/sharding.""" ds_0_num_episodes = ds_1_num_episodes = 10 diff --git a/tests/datasets/test_dataset_reader.py b/tests/datasets/test_dataset_reader.py index bbe858b5d..085563bb8 100644 --- a/tests/datasets/test_dataset_reader.py +++ b/tests/datasets/test_dataset_reader.py @@ -20,7 +20,7 @@ import pytest pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") from 
lerobot.datasets.dataset_reader import DatasetReader -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.utils.import_utils import get_safe_default_video_backend # ── Loading ────────────────────────────────────────────────────────── @@ -35,7 +35,7 @@ def test_try_load_returns_true_when_data_exists(tmp_path, lerobot_dataset_factor root=dataset.root, episodes=None, tolerance_s=1e-4, - video_backend=get_safe_default_codec(), + video_backend=get_safe_default_video_backend(), delta_timestamps=None, image_transforms=None, ) @@ -58,7 +58,7 @@ def test_try_load_returns_false_when_no_data(tmp_path): root=meta.root, episodes=None, tolerance_s=1e-4, - video_backend=get_safe_default_codec(), + video_backend=get_safe_default_video_backend(), delta_timestamps=None, image_transforms=None, ) diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py index 0b0862f00..032fd4f7c 100644 --- a/tests/datasets/test_dataset_tools.py +++ b/tests/datasets/test_dataset_tools.py @@ -23,8 +23,10 @@ import torch pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") +from lerobot.configs import VideoEncoderConfig from lerobot.datasets.dataset_tools import ( add_features, + convert_image_to_video_dataset, delete_episodes, merge_datasets, modify_features, @@ -32,7 +34,6 @@ from lerobot.datasets.dataset_tools import ( remove_feature, split_dataset, ) -from lerobot.scripts.lerobot_edit_dataset import convert_image_to_video_dataset @pytest.fixture @@ -1246,10 +1247,12 @@ def test_convert_image_to_video_dataset(tmp_path): dataset=source_dataset, output_dir=output_dir, repo_id="lerobot/pusht_video", - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, + camera_encoder=VideoEncoderConfig( + vcodec="libsvtav1", + pix_fmt="yuv420p", + g=2, + crf=30, + ), episode_indices=[0, 1], num_workers=2, ) diff --git a/tests/datasets/test_dataset_writer.py b/tests/datasets/test_dataset_writer.py index 
8d2bc0373..8670aeebc 100644 --- a/tests/datasets/test_dataset_writer.py +++ b/tests/datasets/test_dataset_writer.py @@ -25,6 +25,7 @@ from PIL import Image pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") +from lerobot.configs import VideoEncoderConfig from lerobot.datasets.dataset_writer import _encode_video_worker from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.utils import DEFAULT_IMAGE_PATH @@ -52,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict: # ── Existing encode_video_worker tests ─────────────────────────────── -def test_encode_video_worker_forwards_vcodec(tmp_path): - """_encode_video_worker correctly forwards the vcodec parameter.""" +def test_encode_video_worker_forwards_camera_encoder(tmp_path): + """_encode_video_worker forwards camera_encoder to encode_video_frames.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -68,13 +69,21 @@ def test_encode_video_worker_forwards_vcodec(tmp_path): Path(video_path).touch() with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): - _encode_video_worker(video_key, 0, tmp_path, fps=30, vcodec="h264") + _encode_video_worker( + video_key, + 0, + tmp_path, + fps=30, + camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None), + encoder_threads=4, + ) - assert captured_kwargs["vcodec"] == "h264" + assert captured_kwargs["camera_encoder"].vcodec == "h264" + assert captured_kwargs["encoder_threads"] == 4 -def test_encode_video_worker_default_vcodec(tmp_path): - """_encode_video_worker uses libsvtav1 as the default codec.""" +def test_encode_video_worker_default_camera_encoder(tmp_path): + """_encode_video_worker passes None camera_encoder which encode_video_frames defaults.""" video_key = "observation.images.laptop" fpath = 
DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -91,7 +100,8 @@ def test_encode_video_worker_default_vcodec(tmp_path): with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): _encode_video_worker(video_key, 0, tmp_path, fps=30) - assert captured_kwargs["vcodec"] == "libsvtav1" + assert captured_kwargs["camera_encoder"] is None + assert captured_kwargs["encoder_threads"] is None # ── add_frame contracts ────────────────────────────────────────────── diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 654f8cdf1..ba9b64812 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -29,6 +29,7 @@ from PIL import Image from safetensors.torch import load_file from torchvision.transforms import v2 +from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig from lerobot.configs.default import DatasetConfig from lerobot.configs.train import TrainPipelineConfig from lerobot.datasets import make_dataset @@ -43,7 +44,6 @@ from lerobot.datasets.utils import ( DEFAULT_VIDEO_FILE_SIZE_IN_MB, create_branch, ) -from lerobot.datasets.video_utils import VALID_VIDEO_CODECS from lerobot.envs.factory import make_env_config from lerobot.policies.factory import make_policy_config from lerobot.robots import make_robot_from_config @@ -1470,17 +1470,9 @@ def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_fact def test_lerobot_dataset_vcodec_validation(): - """Test that LeRobotDataset validates the vcodec parameter.""" - # Test that invalid vcodec raises ValueError + """Invalid vcodec in encoder config is rejected at construction time.""" with pytest.raises(ValueError, match="Invalid vcodec"): - LeRobotDataset.__new__(LeRobotDataset) # bypass __init__ to test validation directly - # Actually test via create since it's easier - LeRobotDataset.create( - repo_id="test/invalid_codec", - fps=30, - 
features={"observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}}, - vcodec="invalid_codec", - ) + VideoEncoderConfig(vcodec="invalid_codec") def test_valid_video_codecs_constant(): diff --git a/tests/datasets/test_streaming_video_encoder.py b/tests/datasets/test_streaming_video_encoder.py index 8b7a1540f..b69f24254 100644 --- a/tests/datasets/test_streaming_video_encoder.py +++ b/tests/datasets/test_streaming_video_encoder.py @@ -14,11 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for streaming video encoding and hardware-accelerated encoding.""" +"""Tests for streaming video encoding.""" import queue import threading -from unittest.mock import patch import numpy as np import pytest @@ -27,112 +26,20 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])") import av # noqa: E402 +from lerobot.configs import VideoEncoderConfig +from lerobot.datasets.pyav_utils import get_codec from lerobot.datasets.video_utils import ( - VALID_VIDEO_CODECS, StreamingVideoEncoder, _CameraEncoderThread, - _get_codec_options, - detect_available_hw_encoders, - resolve_vcodec, ) from lerobot.utils.constants import OBS_IMAGES -# ─── _get_codec_options tests ─── - - -class TestGetCodecOptions: - def test_libsvtav1_defaults(self): - opts = _get_codec_options("libsvtav1") - assert opts["g"] == "2" - assert opts["crf"] == "30" - assert opts["preset"] == "12" - - def test_libsvtav1_custom_preset(self): - opts = _get_codec_options("libsvtav1", preset=8) - assert opts["preset"] == "8" - - def test_h264_options(self): - opts = _get_codec_options("h264", g=10, crf=23) - assert opts["g"] == "10" - assert opts["crf"] == "23" - assert "preset" not in opts - - def test_videotoolbox_options(self): - opts = _get_codec_options("h264_videotoolbox", g=2, crf=30) - assert opts["g"] == "2" - # CRF 30 maps to quality = max(1, min(100, 100 - 30*2)) = 40 - assert opts["q:v"] == "40" - 
assert "crf" not in opts - - def test_nvenc_options(self): - opts = _get_codec_options("h264_nvenc", g=2, crf=25) - assert opts["rc"] == "constqp" - assert opts["qp"] == "25" - assert "crf" not in opts - # NVENC doesn't support g - assert "g" not in opts - - def test_vaapi_options(self): - opts = _get_codec_options("h264_vaapi", crf=28) - assert opts["qp"] == "28" - - def test_qsv_options(self): - opts = _get_codec_options("h264_qsv", crf=25) - assert opts["global_quality"] == "25" - - def test_no_g_no_crf(self): - opts = _get_codec_options("h264", g=None, crf=None) - assert "g" not in opts - assert "crf" not in opts - - -# ─── HW encoder detection tests ─── - - -class TestHWEncoderDetection: - def test_detect_available_hw_encoders_returns_list(self): - result = detect_available_hw_encoders() - assert isinstance(result, list) - - def test_detect_available_hw_encoders_only_valid(self): - from lerobot.datasets.video_utils import HW_ENCODERS - - result = detect_available_hw_encoders() - for encoder in result: - assert encoder in HW_ENCODERS - - def test_resolve_vcodec_passthrough(self): - assert resolve_vcodec("libsvtav1") == "libsvtav1" - assert resolve_vcodec("h264") == "h264" - - def test_resolve_vcodec_auto_fallback(self): - """When no HW encoders are available, auto should fall back to libsvtav1.""" - with patch("lerobot.datasets.video_utils.detect_available_hw_encoders", return_value=[]): - assert resolve_vcodec("auto") == "libsvtav1" - - def test_resolve_vcodec_auto_picks_hw(self): - """When a HW encoder is available, auto should pick it.""" - with patch( - "lerobot.datasets.video_utils.detect_available_hw_encoders", - return_value=["h264_videotoolbox"], - ): - assert resolve_vcodec("auto") == "h264_videotoolbox" - - def test_resolve_vcodec_auto_returns_valid(self): - """Test that resolve_vcodec('auto') returns a known valid codec.""" - result = resolve_vcodec("auto") - assert result in VALID_VIDEO_CODECS - - def 
test_hw_encoder_names_accepted_in_validation(self): - """Test that HW encoder names pass validation in VALID_VIDEO_CODECS.""" - assert "auto" in VALID_VIDEO_CODECS - assert "h264_videotoolbox" in VALID_VIDEO_CODECS - assert "h264_nvenc" in VALID_VIDEO_CODECS - - def test_resolve_vcodec_invalid_raises(self): - """Test that resolve_vcodec raises ValueError for invalid codecs.""" - with pytest.raises(ValueError, match="Invalid vcodec"): - resolve_vcodec("not_a_real_codec") +# Cross-codec validation tests only fire when the target codec is present +# in the local FFmpeg build; on other platforms validate() is a no-op. +_has_videotoolbox = get_codec("h264_videotoolbox") is not None +_videotoolbox_only = pytest.mark.skipif( + not _has_videotoolbox, reason="h264_videotoolbox not in local FFmpeg build" +) # ─── _CameraEncoderThread tests ─── @@ -150,14 +57,13 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -202,14 +108,13 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -237,14 +142,13 @@ class 
TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -266,11 +170,20 @@ class TestCameraEncoderThread: class TestStreamingVideoEncoder: + def _make_encoder_config(self, **kwargs): + """Helper to build a VideoEncoderConfig.""" + return VideoEncoderConfig(**kwargs) + def test_single_camera_episode(self, tmp_path): """Test encoding a single camera episode.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) - video_keys = [f"{OBS_IMAGES}.laptop"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + ) + encoder.start_episode(video_keys, tmp_path) num_frames = 20 @@ -295,9 +208,11 @@ class TestStreamingVideoEncoder: def test_multi_camera_episode(self, tmp_path): """Test encoding multiple cameras simultaneously.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) - video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 15 @@ -319,8 +234,11 @@ class TestStreamingVideoEncoder: def test_sequential_episodes(self, tmp_path): """Test that multiple sequential episodes work correctly.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) video_keys = 
[f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) for ep in range(3): encoder.start_episode(video_keys, tmp_path) @@ -342,8 +260,11 @@ class TestStreamingVideoEncoder: def test_cancel_episode(self, tmp_path): """Test that canceling an episode cleans up properly.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) video_keys = [f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) @@ -365,28 +286,33 @@ class TestStreamingVideoEncoder: def test_feed_without_start_raises(self, tmp_path): """Test that feeding frames without starting an episode raises.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) with pytest.raises(RuntimeError, match="No active episode"): encoder.feed_frame("cam", np.zeros((64, 96, 3), dtype=np.uint8)) encoder.close() def test_finish_without_start_raises(self, tmp_path): """Test that finishing without starting raises.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) with pytest.raises(RuntimeError, match="No active episode"): encoder.finish_episode() encoder.close() def test_close_is_idempotent(self, tmp_path): """Test that close() can be called multiple times safely.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) encoder.close() encoder.close() # Should not raise def test_video_duration_matches_frame_count(self, tmp_path): """Test that encoded video duration matches num_frames / fps.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) video_keys = 
[f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 90 # 3 seconds at 30fps @@ -417,9 +343,11 @@ class TestStreamingVideoEncoder: def test_multi_camera_start_episode_called_once(self, tmp_path): """Test that with multiple cameras, no frames are lost due to double start_episode.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) - video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 30 @@ -446,17 +374,24 @@ class TestStreamingVideoEncoder: def test_encoder_threads_passed_to_thread(self, tmp_path): """Test that encoder_threads is stored and passed through to encoder threads.""" - encoder = StreamingVideoEncoder( - fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, encoder_threads=2 - ) - assert encoder.encoder_threads == 2 - video_keys = [f"{OBS_IMAGES}.cam"] + cfg = VideoEncoderConfig( + vcodec="libsvtav1", + pix_fmt="yuv420p", + g=2, + crf=30, + ) + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=cfg, + encoder_threads=2, + ) + assert encoder._encoder_threads == 2 encoder.start_episode(video_keys, tmp_path) - # Verify the thread received the encoder_threads value + # Verify codec options include thread tuning for libsvtav1 (lp=…) thread = encoder._threads[f"{OBS_IMAGES}.cam"] - assert thread.encoder_threads == 2 + assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options # Feed some frames and finish to ensure it works end-to-end num_frames = 10 @@ -478,16 +413,20 @@ class TestStreamingVideoEncoder: def test_encoder_threads_none_by_default(self, tmp_path): """Test that encoder_threads 
defaults to None (codec auto-detect).""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") - assert encoder.encoder_threads is None + encoder = StreamingVideoEncoder(fps=30) + assert encoder._encoder_threads is None encoder.close() def test_graceful_frame_dropping(self, tmp_path): """Test that full queue drops frames instead of crashing.""" - encoder = StreamingVideoEncoder( - fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13, queue_maxsize=1 - ) video_keys = [f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + queue_maxsize=1, + ) encoder.start_episode(video_keys, tmp_path) # Feed many frames quickly - with queue_maxsize=1, some will be dropped diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py new file mode 100644 index 000000000..224f2405b --- /dev/null +++ b/tests/datasets/test_video_encoding.py @@ -0,0 +1,595 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for ``lerobot.datasets.video_utils`` encoding functions and ``lerobot.configs.video.VideoEncoderConfig`` config class.""" + +import json +from pathlib import Path + +import numpy as np +import pytest + +pytest.importorskip("av", reason="av is required (install lerobot[dataset])") + +import av # noqa: E402 + +from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig +from lerobot.datasets.image_writer import write_image +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.pyav_utils import get_codec +from lerobot.datasets.utils import INFO_PATH +from lerobot.datasets.video_utils import ( + concatenate_video_files, + encode_video_frames, + get_video_info, +) +from tests.fixtures.constants import DUMMY_VIDEO_INFO + + +# Per-codec skip markers — validation tests only fire when the codec is available +def _require_encoder(vcodec: str) -> pytest.MarkDecorator: + """Skip the test if ``vcodec`` is not available in the local FFmpeg build.""" + return pytest.mark.skipif(get_codec(vcodec) is None, reason=f"{vcodec!r} not in local FFmpeg build") + + +require_libsvtav1 = _require_encoder("libsvtav1") +require_h264 = _require_encoder("h264") +require_videotoolbox = _require_encoder("h264_videotoolbox") +require_nvenc = _require_encoder("h264_nvenc") +require_vaapi = _require_encoder("h264_vaapi") +require_qsv = _require_encoder("h264_qsv") + + +# ─── VideoEncoderConfig / codec options ────────────────────────────── + + +class TestCodecOptions: + @require_libsvtav1 + def test_libsvtav1_defaults(self): + cfg = VideoEncoderConfig() + opts = cfg.get_codec_options() + assert opts["g"] == 2 + assert opts["crf"] == 30 + assert opts["preset"] == 12 + + @require_libsvtav1 + def test_libsvtav1_custom_preset(self): + cfg = VideoEncoderConfig(preset=8) + assert cfg.get_codec_options()["preset"] == 8 + + @require_h264 + def test_h264_options(self): + cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None) + opts = 
cfg.get_codec_options() + assert opts["g"] == 10 + assert opts["crf"] == 23 + assert "preset" not in opts + + @require_videotoolbox + def test_videotoolbox_options(self): + cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None) + opts = cfg.get_codec_options() + assert opts["g"] == 2 + assert opts["q:v"] == 40 + assert "crf" not in opts + + @_require_encoder("h264_nvenc") + def test_nvenc_options(self): + cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None) + opts = cfg.get_codec_options() + assert opts["rc"] == 0 + assert opts["qp"] == 25 + assert "crf" not in opts + assert opts["g"] == 2 + + @_require_encoder("h264_vaapi") + def test_vaapi_options(self): + cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None) + assert cfg.get_codec_options()["qp"] == 28 + + @_require_encoder("h264_qsv") + def test_qsv_options(self): + cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None) + assert cfg.get_codec_options()["global_quality"] == 25 + + @require_h264 + def test_no_g_no_crf(self): + cfg = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None) + opts = cfg.get_codec_options() + assert "g" not in opts + assert "crf" not in opts + + @require_libsvtav1 + def test_encoder_threads_libsvtav1(self): + cfg = VideoEncoderConfig(fast_decode=0) + opts = cfg.get_codec_options(encoder_threads=4) + assert "lp=4" in opts.get("svtav1-params", "") + + @require_h264 + def test_encoder_threads_h264(self): + cfg = VideoEncoderConfig(vcodec="h264", preset=None) + assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2 + + @require_libsvtav1 + def test_fast_decode_libsvtav1(self): + cfg = VideoEncoderConfig(fast_decode=1) + opts = cfg.get_codec_options() + assert "fast-decode=1" in opts.get("svtav1-params", "") + + @require_libsvtav1 + def test_libsvtav1_fast_decode_clamped_to_svt_range(self): + """Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode).""" + cfg = 
VideoEncoderConfig(fast_decode=100) + assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "") + cfg_neg = VideoEncoderConfig(fast_decode=-5) + assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "") + + @require_h264 + def test_fast_decode_h264(self): + cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None) + assert cfg.get_codec_options()["tune"] == "fastdecode" + + @require_libsvtav1 + def test_pix_fmt_unsupported_raises(self): + """Passing an unsupported pix_fmt is a hard error.""" + with pytest.raises(ValueError, match="pix_fmt"): + VideoEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants + + @require_libsvtav1 + @require_h264 + def test_preset_default_behaviour(self): + """Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None.""" + assert VideoEncoderConfig().preset == 12 + assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12 + assert VideoEncoderConfig(vcodec="h264").preset is None + assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None + + @require_h264 + def test_preset_string_on_h264(self): + """h264 accepts string presets and forwards them to FFmpeg.""" + cfg = VideoEncoderConfig(vcodec="h264", preset="slow") + assert cfg.get_codec_options()["preset"] == "slow" + + @require_videotoolbox + def test_preset_on_videotoolbox_not_set(self): + """videotoolbox has no preset option at all.""" + cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow") + assert "preset" not in cfg.get_codec_options() + + @require_libsvtav1 + def test_libsvtav1_preset_out_of_range_raises(self): + """libsvtav1 preset must sit in [-2, 13] as exposed by PyAV.""" + with pytest.raises(ValueError, match="out of range"): + VideoEncoderConfig(vcodec="libsvtav1", preset=100) + with pytest.raises(ValueError, match="out of range"): + VideoEncoderConfig(vcodec="libsvtav1", preset=-3) + + @require_libsvtav1 + def test_libsvtav1_crf_out_of_range_raises(self): + 
"""libsvtav1 crf must sit in [0, 63].""" + with pytest.raises(ValueError, match="crf.*out of range"): + VideoEncoderConfig(vcodec="libsvtav1", crf=64) + + @require_libsvtav1 + def test_libsvtav1_crf_rejects_python_float(self): + """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig(vcodec="libsvtav1", crf=2.5) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_fractional_string(self): + """INT options reject fractional values even when supplied only via ``extra_options``.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": "2.5"}, + ) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_float(self): + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": 2.5}, + ) + + @require_h264 + def test_h264_crf_accepts_float_and_int(self): + """x264 exposes crf as a FLOAT option, so both int and float are accepted.""" + assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23 + assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5 + + @require_libsvtav1 + def test_validate_is_rerunnable(self): + """After mutating a field, validate() re-checks and surfaces new issues.""" + cfg = VideoEncoderConfig(vcodec="libsvtav1") + cfg.preset = 100 # now out of range + with pytest.raises(ValueError, match="out of range"): + cfg.validate() + + +class TestExtraOptions: + @require_libsvtav1 + def test_default_is_empty_dict(self): + cfg = VideoEncoderConfig() + assert cfg.extra_options == {} + + @require_libsvtav1 + def test_unknown_key_passes_through(self): + """Keys not published as AVOptions are forwarded to FFmpeg.""" + cfg = VideoEncoderConfig(extra_options={"totally_made_up_option": 
"value"}) + assert cfg.extra_options == {"totally_made_up_option": "value"} + + @require_libsvtav1 + def test_numeric_value_in_range_ok(self): + """libsvtav1 exposes ``qp`` as INT in [0, 63].""" + cfg = VideoEncoderConfig(extra_options={"qp": 30}) + assert cfg.extra_options == {"qp": 30} + + @require_libsvtav1 + def test_numeric_out_of_range_raises(self): + with pytest.raises(ValueError, match=r"qp=.*out of range"): + VideoEncoderConfig(extra_options={"qp": 999}) + + @require_libsvtav1 + def test_numeric_string_accepted_in_range(self): + """Numeric strings are accepted for numeric options (mirrors FFmpeg).""" + cfg = VideoEncoderConfig(extra_options={"qp": "18"}) + assert cfg.extra_options == {"qp": "18"} + + @require_libsvtav1 + def test_numeric_string_out_of_range_raises(self): + with pytest.raises(ValueError, match=r"qp=.*out of range"): + VideoEncoderConfig(extra_options={"qp": "999"}) + + @require_libsvtav1 + def test_non_numeric_string_on_numeric_option_raises(self): + with pytest.raises(ValueError, match=r"qp=.*not numeric"): + VideoEncoderConfig(extra_options={"qp": "medium"}) + + @require_libsvtav1 + def test_bool_on_numeric_option_raises(self): + """``bool`` is explicitly rejected for numeric options.""" + with pytest.raises(ValueError, match=r"qp=.*not numeric"): + VideoEncoderConfig(extra_options={"qp": True}) + + @require_h264 + def test_string_option_passes_through_unchecked(self): + """String-typed AVOptions are NOT enum-checked (too many accept freeform).""" + cfg = VideoEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"}) + assert cfg.extra_options == {"tune": "some-future-tune"} + + @require_libsvtav1 + def test_merged_into_codec_options_and_stringified(self): + """Typed merge by default; ``as_strings=True`` matches FFmpeg option dict.""" + cfg = VideoEncoderConfig(extra_options={"qp": 20}) + opts = cfg.get_codec_options() + assert opts["qp"] == 20 + assert isinstance(opts["qp"], int) + assert 
cfg.get_codec_options(as_strings=True)["qp"] == "20" + + @require_libsvtav1 + def test_structured_fields_win_on_collision(self): + """A colliding extra_options key is discarded; the structured field wins.""" + cfg = VideoEncoderConfig(crf=30, extra_options={"crf": 18}) + assert cfg.get_codec_options()["crf"] == 30 + + +class TestEncoderDetection: + @require_h264 + def test_explicit_codec_kept_when_available(self): + cfg = VideoEncoderConfig(vcodec="h264") + assert cfg.vcodec == "h264" + + @require_videotoolbox + def test_auto_picks_videotoolbox_when_available(self): + """``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present.""" + cfg = VideoEncoderConfig(vcodec="auto") + assert cfg.vcodec == "h264_videotoolbox" + + def test_invalid_codec_raises(self): + with pytest.raises(ValueError, match="Invalid vcodec"): + VideoEncoderConfig(vcodec="not_a_real_codec") + + def test_hw_encoder_names_listed_as_valid(self): + assert "auto" in VALID_VIDEO_CODECS + assert "h264_videotoolbox" in VALID_VIDEO_CODECS + assert "h264_nvenc" in VALID_VIDEO_CODECS + + +TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos" + +# Default video feature set used by persistence tests. 
+VIDEO_FEATURES = { + "observation.images.cam": { + "dtype": "video", + "shape": (64, 96, 3), + "names": ["height", "width", "channels"], + }, + "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, +} +VIDEO_KEY = "observation.images.cam" + + +def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None: + imgs_dir.mkdir(parents=True, exist_ok=True) + for i in range(num_frames): + arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + write_image(arr, imgs_dir / f"frame-{i:06d}.png") + + +def _encode_video( + path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None +) -> Path: + imgs_dir = path.parent / f"imgs_{path.stem}" + _write_frames(imgs_dir, num_frames=num_frames) + encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True) + return path + + +def _read_feature_info(dataset: LeRobotDataset) -> dict: + info = json.loads((dataset.root / INFO_PATH).read_text()) + return info["features"][VIDEO_KEY]["info"] + + +def _add_frames(dataset: LeRobotDataset, num_frames: int) -> None: + shape = dataset.meta.features[VIDEO_KEY]["shape"] + for _ in range(num_frames): + dataset.add_frame( + { + VIDEO_KEY: np.random.randint(0, 256, shape, dtype=np.uint8), + "action": np.zeros(2, dtype=np.float32), + "task": "test", + } + ) + + +class TestGetVideoInfo: + def test_returns_all_stream_fields(self): + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4") + + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.pix_fmt"] == "yuv420p" + assert info["video.fps"] == 30 + assert info["video.channels"] == 3 + assert info["video.is_depth_map"] is False + assert info["has_audio"] is False + assert "video.g" not in info + assert "video.crf" not in info + assert "video.preset" not in info + + @require_libsvtav1 + def test_merges_encoder_config_as_video_prefixed_entries(self): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, 
crf=30, preset=12) + + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + + assert info["video.g"] == 2 + assert info["video.crf"] == 30 + assert info["video.preset"] == 12 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + @require_libsvtav1 + def test_stream_derived_keys_take_precedence_over_config(self): + cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p") + + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + + assert info["video.codec"] # populated from stream, not from config's vcodec + assert info["video.pix_fmt"] == "yuv420p" + + +class TestEncodeVideoFrames: + @require_libsvtav1 + def test_produces_readable_mp4(self, tmp_path): + video_path = _encode_video(tmp_path / "out.mp4") + + assert video_path.exists() + info = get_video_info(video_path) + assert info["video.height"] == 64 + assert info["video.width"] == 96 + + @require_libsvtav1 + def test_frame_count_and_duration_match_input(self, tmp_path): + num_frames = 10 + fps = 30 + video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps) + + with av.open(str(video_path)) as container: + stream = container.streams.video[0] + actual_frames = sum(1 for _ in container.decode(stream)) + duration = ( + float(stream.duration * stream.time_base) + if stream.duration is not None + else float(container.duration / av.time_base) + ) + + assert actual_frames == num_frames + assert abs(duration - num_frames / fps) < 0.1 + + def test_overwrite_false_skips_existing_file(self, tmp_path): + imgs_dir = tmp_path / "imgs" + _write_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + sentinel = b"pre-existing content" + video_path.write_bytes(sentinel) + + encode_video_frames(imgs_dir, video_path, fps=30, overwrite=False) + + assert video_path.read_bytes() == sentinel + + @require_libsvtav1 + def 
test_overwrite_true_replaces_existing_file(self, tmp_path): + imgs_dir = tmp_path / "imgs" + _write_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + video_path.write_bytes(b"stale content") + + encode_video_frames(imgs_dir, video_path, fps=30, overwrite=True) + + info = get_video_info(video_path) + assert info["video.height"] == 64 + + @require_libsvtav1 + def test_custom_encoder_config_fields_stored_in_info(self, tmp_path): + """All stream-derived and encoder config fields are present after encoding.""" + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10) + video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg) + + info = get_video_info(video_path, camera_encoder=cfg) + + # Stream-derived + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.channels"] == 3 + assert info["video.codec"] == "av1" + assert info["video.pix_fmt"] == "yuv420p" + assert info["video.fps"] == 30 + assert info["video.is_depth_map"] is False + assert info["has_audio"] is False + # Encoder config + assert info["video.g"] == 4 + assert info["video.crf"] == 25 + assert info["video.preset"] == 10 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + +class TestConcatenateVideoFiles: + def test_two_clips_frame_count(self, tmp_path): + """Output frame count equals the sum of the two input frame counts.""" + out = tmp_path / "out.mp4" + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_6frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out + ) + + with av.open(str(out)) as container: + total = sum(1 for _ in container.decode(video=0)) + assert total == 10 + + def test_three_clips_frame_count(self, tmp_path): + out = tmp_path / "out.mp4" + clip = TEST_ARTIFACTS_DIR / "clip_5frames.mp4" + concatenate_video_files([clip, clip, clip], out) + + with av.open(str(out)) as container: + total = sum(1 for _ in 
container.decode(video=0)) + assert total == 15 + + @require_libsvtav1 + def test_geometry_preserved(self, tmp_path): + """Output resolution, fps, codec and pixel format must match the inputs.""" + out = tmp_path / "out.mp4" + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out + ) + + info = get_video_info(out) + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.fps"] == 30 + assert info["video.codec"] == "av1" + assert info["video.pix_fmt"] == "yuv420p" + + def test_compatibility_check_raises_on_different_codec(self, tmp_path): + with pytest.raises(ValueError): + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_h264.mp4"], + tmp_path / "out.mp4", + compatibility_check=True, + ) + + def test_compatibility_check_raises_on_different_resolution(self, tmp_path): + with pytest.raises(ValueError): + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_32x48.mp4"], + tmp_path / "out.mp4", + compatibility_check=True, + ) + + +class TestEncoderConfigPersistence: + """Encoder config must be stored as ``video.`` entries in + ``info["features"][key]["info"]`` when the first episode is saved. 
+ """ + + @require_libsvtav1 + def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + ) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + info = _read_feature_info(dataset) + + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.fps"] == 30 + assert info["video.g"] == 2 + assert info["video.crf"] == 30 + assert info["video.preset"] == 12 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + @require_libsvtav1 + def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + ) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + first_info = dict(_read_feature_info(dataset)) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + assert _read_feature_info(dataset) == first_info + + +class TestFromVideoInfo: + """``VideoEncoderConfig.from_video_info`` reconstructs an encoder config + from the ``video.*`` keys persisted in a dataset's ``info.json``. + """ + + @require_libsvtav1 + def test_reconstructs_from_dummy_video_info(self): + cfg = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO) + + # Canonical stream codec ``"av1"`` is aliased to the encoder name. 
+ assert cfg.vcodec == "libsvtav1" + assert cfg.pix_fmt == DUMMY_VIDEO_INFO["video.pix_fmt"] + assert cfg.g == DUMMY_VIDEO_INFO["video.g"] + assert cfg.crf == DUMMY_VIDEO_INFO["video.crf"] + assert cfg.preset == DUMMY_VIDEO_INFO["video.preset"] + assert cfg.fast_decode == DUMMY_VIDEO_INFO["video.fast_decode"] + assert cfg.video_backend == DUMMY_VIDEO_INFO["video.video_backend"] + # ``{}`` placeholder (typical after a merge with disagreeing sources) + # must not leak into the reconstructed config. + assert cfg.extra_options == VideoEncoderConfig().extra_options diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py index 35d8776ce..4d578b503 100644 --- a/tests/fixtures/constants.py +++ b/tests/fixtures/constants.py @@ -28,17 +28,23 @@ DUMMY_MOTOR_FEATURES = { "names": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"], }, } -DUMMY_CAMERA_FEATURES = { - "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None}, - "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None}, -} DEFAULT_FPS = 30 DUMMY_VIDEO_INFO = { "video.fps": DEFAULT_FPS, "video.codec": "av1", "video.pix_fmt": "yuv420p", + "video.video_backend": "pyav", + "video.extra_options": {}, + "video.g": 2, + "video.crf": 30, + "video.preset": 12, + "video.fast_decode": 0, "video.is_depth_map": False, "has_audio": False, } +DUMMY_CAMERA_FEATURES = { + "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, + "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, +} DUMMY_CHW = (3, 96, 128) DUMMY_HWC = (96, 128, 3) diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py index 48128a8d0..a6e349778 100644 --- a/tests/fixtures/dataset_factories.py +++ b/tests/fixtures/dataset_factories.py @@ -46,7 +46,6 @@ from tests.fixtures.constants import ( DUMMY_MOTOR_FEATURES, DUMMY_REPO_ID, 
DUMMY_ROBOT_TYPE, - DUMMY_VIDEO_INFO, ) @@ -134,9 +133,7 @@ def features_factory(): use_videos: bool = True, ) -> dict: if use_videos: - camera_ft = { - key: {"dtype": "video", **ft, **DUMMY_VIDEO_INFO} for key, ft in camera_features.items() - } + camera_ft = {key: {"dtype": "video", **ft} for key, ft in camera_features.items()} else: camera_ft = {key: {"dtype": "image", **ft} for key, ft in camera_features.items()} return {