mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-11 22:59:50 +00:00
Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 18ef0c270b | |||
| 58cd98c0d3 | |||
| a3c670b987 | |||
| 8cd74ea8b8 | |||
| be1180b240 | |||
| fff6bc1a93 | |||
| 141304ac78 | |||
| 9b3c752b64 | |||
| 3dc73551dd | |||
| 237bae51e8 | |||
| df8b33fc68 | |||
| 50e2d7b5f4 | |||
| 016799dfa1 | |||
| 51b9038458 | |||
| cc9a2e5c99 | |||
| a2376389f9 | |||
| 57a619ab02 | |||
| 7f624adcc5 | |||
| 375cf1fdf3 | |||
| b2c2bb7641 | |||
| 4a87ee1537 | |||
| e44f86e516 | |||
| a0e3acdb67 | |||
| 38ff579bcc | |||
| 479e444517 | |||
| 9787b8fa26 | |||
| 71f39f6912 |
@@ -39,6 +39,7 @@ from tqdm import tqdm
|
||||
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.video_utils import (
|
||||
VideoEncoderConfig,
|
||||
decode_video_frames,
|
||||
encode_video_frames,
|
||||
)
|
||||
@@ -251,10 +252,13 @@ def benchmark_encoding_decoding(
|
||||
imgs_dir=imgs_dir,
|
||||
video_path=video_path,
|
||||
fps=fps,
|
||||
vcodec=encoding_cfg["vcodec"],
|
||||
pix_fmt=encoding_cfg["pix_fmt"],
|
||||
g=encoding_cfg.get("g"),
|
||||
crf=encoding_cfg.get("crf"),
|
||||
camera_encoder_config=VideoEncoderConfig(
|
||||
vcodec=encoding_cfg["vcodec"],
|
||||
pix_fmt=encoding_cfg["pix_fmt"],
|
||||
g=encoding_cfg.get("g"),
|
||||
crf=encoding_cfg.get("crf"),
|
||||
preset=encoding_cfg.get("preset"),
|
||||
),
|
||||
# fast_decode=encoding_cfg.get("fastdecode"),
|
||||
overwrite=True,
|
||||
)
|
||||
|
||||
@@ -33,6 +33,8 @@
|
||||
title: Using the Dataset Tools
|
||||
- local: dataset_subtask
|
||||
title: Using Subtasks in the Dataset
|
||||
- local: video_encoding_parameters
|
||||
title: Video encoding parameters
|
||||
- local: streaming_video_encoding
|
||||
title: Streaming Video Encoding
|
||||
title: "Datasets"
|
||||
|
||||
+1
-1
@@ -90,6 +90,6 @@ lerobot-record \
|
||||
--dataset.single_task="Your task description" \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--policy.path=${HF_USER}/act_policy
|
||||
```
|
||||
|
||||
@@ -194,7 +194,7 @@ lerobot-record \
|
||||
--dataset.single_task="Navigate around obstacles" \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ lerobot-record \
|
||||
--dataset.single_task="Grab and handover the red cube to the other arm" \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--policy.path=<user>/groot-bimanual \ # your trained model
|
||||
--dataset.episode_time_s=30 \
|
||||
--dataset.reset_time_s=10
|
||||
|
||||
@@ -232,7 +232,7 @@ lerobot-record \
|
||||
--dataset.private=true \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
@@ -278,6 +278,6 @@ lerobot-record \
|
||||
--dataset.num_episodes=10 \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model
|
||||
```
|
||||
|
||||
@@ -193,7 +193,7 @@ lerobot-record \
|
||||
--dataset.num_episodes=5 \
|
||||
--dataset.single_task="Grab the black cube" \
|
||||
--dataset.streaming_encoding=true \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--dataset.encoder_threads=2
|
||||
```
|
||||
</hfoption>
|
||||
|
||||
@@ -43,7 +43,7 @@ lerobot-record \
|
||||
--dataset.num_episodes=5 \
|
||||
--dataset.single_task="Grab the black cube" \
|
||||
--dataset.streaming_encoding=true \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--dataset.encoder_threads=2
|
||||
```
|
||||
|
||||
|
||||
@@ -161,7 +161,7 @@ lerobot-record \
|
||||
--dataset.private=true \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
@@ -203,7 +203,7 @@ lerobot-record \
|
||||
--dataset.private=true \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
|
||||
@@ -108,7 +108,7 @@ lerobot-record \
|
||||
--dataset.num_episodes=10 \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
# --dataset.vcodec=auto \
|
||||
# --dataset.camera_encoder_config.vcodec=auto \
|
||||
# <- Teleop optional if you want to teleoperate in between episodes \
|
||||
# --teleop.type=so100_leader \
|
||||
# --teleop.port=/dev/ttyACM0 \
|
||||
|
||||
@@ -14,12 +14,12 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti
|
||||
|
||||
## 2. Tuning Parameters
|
||||
|
||||
| Parameter | CLI Flag | Type | Default | Description |
|
||||
| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
|
||||
| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture |
|
||||
| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
|
||||
| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide |
|
||||
| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM |
|
||||
| Parameter | CLI Flag | Type | Default | Description |
|
||||
| ----------------------- | ---------------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
|
||||
| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture |
|
||||
| `vcodec` | `--dataset.camera_encoder_config.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
|
||||
| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide |
|
||||
| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM |
|
||||
|
||||
## 3. Performance Considerations
|
||||
|
||||
@@ -48,7 +48,7 @@ This parameter controls how many threads each encoder instance uses internally:
|
||||
|
||||
### Backpressure and Frame Dropping
|
||||
|
||||
Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). When the encoder can't keep up:
|
||||
Each camera has a bounded queue (`encoder_queue_maxsize`, default 30 frames). When the encoder can't keep up:
|
||||
|
||||
1. The queue fills up (consuming RAM)
|
||||
2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted
|
||||
@@ -82,15 +82,15 @@ Use HW encoding when:
|
||||
|
||||
### Available HW Encoders
|
||||
|
||||
| Encoder | Platform | Hardware | CLI Value |
|
||||
| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ |
|
||||
| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` |
|
||||
| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` |
|
||||
| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` |
|
||||
| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` |
|
||||
| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` |
|
||||
| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` |
|
||||
| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` |
|
||||
| Encoder | Platform | Hardware | CLI Value |
|
||||
| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ---------------------------------------------------------- |
|
||||
| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder_config.vcodec=h264_videotoolbox` |
|
||||
| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder_config.vcodec=hevc_videotoolbox` |
|
||||
| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder_config.vcodec=h264_nvenc` |
|
||||
| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder_config.vcodec=hevc_nvenc` |
|
||||
| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder_config.vcodec=h264_vaapi` |
|
||||
| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder_config.vcodec=h264_qsv` |
|
||||
| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder_config.vcodec=auto` |
|
||||
|
||||
> [!NOTE]
|
||||
> In order to use the HW accelerated encoders you might need to upgrade your GPU drivers.
|
||||
@@ -100,15 +100,15 @@ Use HW encoding when:
|
||||
|
||||
## 5. Troubleshooting
|
||||
|
||||
| Symptom | Likely Cause | Fix |
|
||||
| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) |
|
||||
| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). |
|
||||
| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
|
||||
| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
|
||||
| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
|
||||
| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` |
|
||||
| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
|
||||
| Symptom | Likely Cause | Fix |
|
||||
| ------------------------------------------------------------------ | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder_config.vcodec=auto`) |
|
||||
| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder_config.vcodec=auto`). |
|
||||
| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
|
||||
| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
|
||||
| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
|
||||
| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder_config.vcodec=auto` |
|
||||
| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
|
||||
|
||||
## 6. Recommended Configurations
|
||||
|
||||
@@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the
|
||||
# 2camsx 640x480x3 @30fps: Requires some tuning.
|
||||
|
||||
# Use H.264, disable streaming, consider batching encoding
|
||||
lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ...
|
||||
lerobot-record --dataset.camera_encoder_config.vcodec=h264 --dataset.streaming_encoding=false ...
|
||||
```
|
||||
|
||||
## 7. Closing note
|
||||
|
||||
@@ -117,10 +117,10 @@ lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht_image \
|
||||
--operation.type convert_image_to_video \
|
||||
--operation.output_dir outputs/pusht_video \
|
||||
--operation.vcodec libsvtav1 \
|
||||
--operation.pix_fmt yuv420p \
|
||||
--operation.g 2 \
|
||||
--operation.crf 30
|
||||
--operation.camera_encoder_config.vcodec libsvtav1 \
|
||||
--operation.camera_encoder_config.pix_fmt yuv420p \
|
||||
--operation.camera_encoder_config.g 2 \
|
||||
--operation.camera_encoder_config.crf 30
|
||||
|
||||
# Convert only specific episodes
|
||||
lerobot-edit-dataset \
|
||||
@@ -147,11 +147,7 @@ lerobot-edit-dataset \
|
||||
**Parameters:**
|
||||
|
||||
- `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`)
|
||||
- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`)
|
||||
- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`)
|
||||
- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2)
|
||||
- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30)
|
||||
- `fast_decode`: Fast decode tuning option (default: 0)
|
||||
- `camera_encoder_config`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder_config.<field>. See [Video Encoding Parameters](./video_encoding_parameters) for more details.
|
||||
- `episode_indices`: List of specific episodes to convert (default: all episodes)
|
||||
- `num_workers`: Number of parallel workers for processing (default: 4)
|
||||
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
# Video encoding parameters
|
||||
|
||||
When **video storage** is on, LeRobot stores each camera stream as an **MP4** file rather than saving **every timestep as its own image file**. **Video encoding compress across time**, which usually cuts **dataset size and I/O** compared to heaps of PNGs, and MP4 stays a **familiar format** for players and loaders. Incoding frames into a MP4 file is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs speed, and
|
||||
optional extra encoder flags. **Many of those knobs are user-tunable** and are exposed on the dataset config as
|
||||
**`dataset.camera_encoder_config`** — a nested **`VideoEncoderConfig`** (`lerobot.datasets.video_utils.
|
||||
VideoEncoderConfig`) passed through **PyAV**.
|
||||
|
||||
You can set these parameters from the CLI with **`--dataset.camera_encoder_config.<field>`** (e.g. `lerobot-record`, `lerobot-rollout`). The same block applies to **every** camera video stream in that run. **Video storage must be on** — **`use_videos=True`** in Python APIs or **`--dataset.video=true`** (recording default); with video off, inputs stay as images and **`camera_encoder_config` is ignored.**
|
||||
|
||||
For **when** frames are written vs encoded (streaming vs post-episode), queues, and other top-level **`--dataset.*`** switches, see [Streaming Video Encoding](./streaming_video_encoding). For codec/size/speed experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark).
|
||||
|
||||
---
|
||||
|
||||
## Tuning Parameters
|
||||
|
||||
| Parameter | CLI flag | Type | Default | Description |
|
||||
| --------------- | ----------------------------------------------- | -------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vcodec` | `--dataset.camera_encoder_config.vcodec` | `str` | `"libsvtav1"` | Video codec name. `"auto"` picks the first available hardware encoder from a fixed preference list, else `libsvtav1`. |
|
||||
| `pix_fmt` | `--dataset.camera_encoder_config.pix_fmt` | `str` | `"yuv420p"` | Output pixel format; must be supported by the specified codec in your FFmpeg build. |
|
||||
| `g` | `--dataset.camera_encoder_config.g` | `int \| None` | `2` | GOP size (keyframes every `g` frames). Emitted as FFmpeg option `g`. |
|
||||
| `crf` | `--dataset.camera_encoder_config.crf` | `int \| None` | `30` | Abstract **quality**; mapped per codec in the table below (CRF, QP, `q:v`, etc.). Lower → higher quality / larger output where the mapping is monotone. |
|
||||
| `preset` | `--dataset.camera_encoder_config.preset` | `int \| str \| None` | `12`\* | Video encoding speed preset; meaning depends on the specified codec. \*Unset + `libsvtav1` → LeRobot sets `12`. |
|
||||
| `fast_decode` | `--dataset.camera_encoder_config.fast_decode` | `int` | `0` | `libsvtav1`: `0–2` passed in `svtav1-params`; `h264` / `hevc` (software): if `>0`, sets `tune=fastdecode`; other codecs: often unused. |
|
||||
| `video_backend` | `--dataset.camera_encoder_config.video_backend` | `str` | `"pyav"` | Only `"pyav"` is implemented for video encoding today. |
|
||||
| `extra_options` | (nested config / non-scalar) | `dict` | `{}` | Extra FFmpeg options merged after the built-in mapping; **cannot** override keys already set from structured fields above. |
|
||||
|
||||
---
|
||||
|
||||
## Validation
|
||||
|
||||
| What | Behavior |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Video codec presence | `vcodec` must exist as a video encoder in the local FFmpeg build (after resolving `"auto"`). |
|
||||
| Pixel format | `pix_fmt` is checked against the encoder’s reported pixel formats when available. |
|
||||
| Options | `get_codec_options()` output (including values originating from `extra_options`) is checked against PyAV/FFmpeg option metadata (ranges, integer constraints, string choices) where applicable. |
|
||||
|
||||
---
|
||||
|
||||
## Mapping: `VideoEncoderConfig` → FFmpeg options
|
||||
|
||||
From **`get_codec_options()`** after `vcodec` resolution. Only fields on `camera_encoder_config` are listed here (no global thread / queue flags).
|
||||
|
||||
| Resolved `vcodec` | `g` | Quality from `crf` | `preset` | `fast_decode` |
|
||||
| ---------------------------------------- | --- | --------------------------- | -------- | ------------------------------------------ |
|
||||
| `libsvtav1` | `g` | `crf` | `preset` | `svtav1-params` includes `fast-decode=0…2` |
|
||||
| `h264`, `hevc` (software) | `g` | `crf` | `preset` | `tune=fastdecode` if `fast_decode > 0` |
|
||||
| `h264_videotoolbox`, `hevc_videotoolbox` | `g` | `q:v` (derived from `crf`) | — | — |
|
||||
| `h264_nvenc`, `hevc_nvenc` | `g` | `rc=constqp` + `qp` ← `crf` | `preset` | — |
|
||||
| `h264_vaapi` | `g` | `qp` ← `crf` | — | — |
|
||||
| `h264_qsv` | `g` | `global_quality` ← `crf` | `preset` | — |
|
||||
|
||||
---
|
||||
|
||||
## `extra_options`
|
||||
|
||||
- Merged **after** structured options; keys **already** set by `g`, `crf`, `preset`, etc. are **not** replaced by `extra_options`.
|
||||
- Values are strings or numbers as FFmpeg expects; numeric values are validated when the codec exposes option metadata.
|
||||
|
||||
---
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
lerobot-record \
|
||||
--robot.type=so100_follower \
|
||||
--robot.port=/dev/tty.usbmodem58760431541 \
|
||||
--robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--robot.id=black \
|
||||
--teleop.type=so100_leader \
|
||||
--teleop.port=/dev/tty.usbmodem58760431551 \
|
||||
--teleop.id=blue \
|
||||
--dataset.repo_id=<my_username>/<my_dataset_name> \
|
||||
--dataset.num_episodes=2 \
|
||||
--dataset.single_task="Grab the cube" \
|
||||
--dataset.streaming_encoding=true \
|
||||
--dataset.encoder_threads=2 \
|
||||
--dataset.camera_encoder_config.vcodec=h264 \
|
||||
--dataset.camera_encoder_config.preset=fast \
|
||||
--dataset.camera_encoder_config.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \
|
||||
--display_data=true
|
||||
```
|
||||
@@ -31,6 +31,12 @@ from .types import (
|
||||
PolicyFeature,
|
||||
RTCAttentionSchedule,
|
||||
)
|
||||
from .video import (
|
||||
VALID_VIDEO_CODECS,
|
||||
VIDEO_ENCODER_INFO_KEYS,
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Types
|
||||
@@ -46,4 +52,10 @@ __all__ = [
|
||||
"PeftConfig",
|
||||
"PreTrainedConfig",
|
||||
"WandBConfig",
|
||||
"VideoEncoderConfig",
|
||||
# Defaults
|
||||
"camera_encoder_defaults",
|
||||
# Constants
|
||||
"VALID_VIDEO_CODECS",
|
||||
"VIDEO_ENCODER_INFO_KEYS",
|
||||
]
|
||||
|
||||
@@ -14,10 +14,12 @@
|
||||
|
||||
"""Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .video import VideoEncoderConfig, camera_encoder_defaults
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatasetRecordConfig:
|
||||
@@ -55,10 +57,9 @@ class DatasetRecordConfig:
|
||||
# Number of episodes to record before batch encoding videos
|
||||
# Set to 1 for immediate encoding (default behavior), or higher for batched encoding
|
||||
video_encoding_batch_size: int = 1
|
||||
# Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto',
|
||||
# or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'.
|
||||
# Use 'auto' to auto-detect the best available hardware encoder.
|
||||
vcodec: str = "libsvtav1"
|
||||
# Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
|
||||
# e.g. ``--dataset.camera_encoder_config.vcodec=h264`` (see ``VideoEncoderConfig``).
|
||||
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
# Enable streaming video encoding: encode frames in real-time during capture instead
|
||||
# of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
|
||||
streaming_encoding: bool = False
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.transforms import ImageTransformsConfig
|
||||
from lerobot.utils.import_utils import get_safe_default_codec
|
||||
from lerobot.utils.import_utils import get_safe_default_video_backend
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -34,7 +34,7 @@ class DatasetConfig:
|
||||
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
|
||||
revision: str | None = None
|
||||
use_imagenet_stats: bool = True
|
||||
video_backend: str = field(default_factory=get_safe_default_codec)
|
||||
video_backend: str = field(default_factory=get_safe_default_video_backend)
|
||||
# When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0).
|
||||
# This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion.
|
||||
return_uint8: bool = False
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Note: We subclass str so that serialization is straightforward
|
||||
# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json
|
||||
|
||||
"""Video encoder configurations."""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from lerobot.utils.import_utils import _av_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _av_available:
|
||||
from lerobot.datasets.pyav_utils import check_video_encoder_config_pyav, detect_available_encoders_pyav
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend.
|
||||
# Determines the order of preference for auto-selection when vcodec="auto" is used.
|
||||
HW_VIDEO_CODECS = [
|
||||
"h264_videotoolbox", # macOS
|
||||
"hevc_videotoolbox", # macOS
|
||||
"h264_nvenc", # NVIDIA GPU
|
||||
"hevc_nvenc", # NVIDIA GPU
|
||||
"h264_vaapi", # Linux Intel/AMD
|
||||
"h264_qsv", # Intel Quick Sync
|
||||
]
|
||||
VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
|
||||
|
||||
LIBSVTAV1_DEFAULT_PRESET: int = 12
|
||||
|
||||
# Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`).
|
||||
# ``vcodec``` and ``pix_fmt`` are derived from the video stream directly.
|
||||
VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset(
|
||||
{"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"}
|
||||
)
|
||||
VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset(
|
||||
f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class VideoEncoderConfig:
|
||||
"""Video encoder configuration.
|
||||
|
||||
Attributes:
|
||||
vcodec: Video encoder name. ``"auto"`` is resolved during
|
||||
construction (HW encoder if available, else ``libsvtav1``).
|
||||
pix_fmt: Pixel format (e.g. ``"yuv420p"``).
|
||||
g: GOP size (keyframe interval).
|
||||
crf: Quality level — mapped to the native quality parameter of the
|
||||
codec (``crf`` for software, ``qp`` for NVENC/VAAPI,
|
||||
``q:v`` for VideoToolbox, ``global_quality`` for QSV).
|
||||
preset: Speed/quality preset. Accepted type is per-codec.
|
||||
fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2)
|
||||
embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values
|
||||
set ``tune=fastdecode``. Ignored for other codecs.
|
||||
video_backend: Python to be used for encoding. Only ``"pyav"``
|
||||
is currently supported.
|
||||
extra_options: Free-form dictionary of additional video encoder options
|
||||
(e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``).
|
||||
"""
|
||||
|
||||
vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ?
|
||||
pix_fmt: str = "yuv420p"
|
||||
g: int | None = 2
|
||||
crf: int | None = 30
|
||||
preset: int | str | None = None
|
||||
fast_decode: int = 0
|
||||
# TODO(CarolinePascal): add torchcodec support + find a way to unify the
|
||||
# two backends (encoding and decoding).
|
||||
video_backend: str = "pyav"
|
||||
extra_options: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.resolve_vcodec()
|
||||
# Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work".
|
||||
if self.preset is None and self.vcodec == "libsvtav1":
|
||||
self.preset = LIBSVTAV1_DEFAULT_PRESET
|
||||
self.validate()
|
||||
|
||||
def detect_available_encoders(self, encoders: list[str] | str) -> list[str]:
|
||||
"""Return the subset of available encoders based on the specified video backend.
|
||||
|
||||
Args:
|
||||
encoders: List of encoder names to detect. If a string, it is converted to a list.
|
||||
Returns:
|
||||
List of available encoder names. If the video backend is not "pyav", returns an empty list.
|
||||
"""
|
||||
if self.video_backend == "pyav":
|
||||
require_package("av", extra="dataset")
|
||||
return detect_available_encoders_pyav(encoders)
|
||||
return []
|
||||
|
||||
def validate(self) -> None:
|
||||
"""Validate the video encoder configuration."""
|
||||
if self.video_backend == "pyav":
|
||||
require_package("av", extra="dataset")
|
||||
check_video_encoder_config_pyav(self)
|
||||
|
||||
def resolve_vcodec(self) -> None:
|
||||
"""Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder.
|
||||
|
||||
For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the
|
||||
resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``.
|
||||
"""
|
||||
if self.vcodec not in VALID_VIDEO_CODECS:
|
||||
raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
|
||||
if self.vcodec == "auto":
|
||||
available = self.detect_available_encoders(HW_VIDEO_CODECS)
|
||||
for encoder in HW_VIDEO_CODECS:
|
||||
if encoder in available:
|
||||
logger.info(f"Auto-selected video codec: {encoder}")
|
||||
self.vcodec = encoder
|
||||
return
|
||||
logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'")
|
||||
self.vcodec = "libsvtav1"
|
||||
|
||||
if self.detect_available_encoders(self.vcodec):
|
||||
logger.info(f"Using video codec: {self.vcodec}")
|
||||
return
|
||||
raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}")
|
||||
|
||||
def get_codec_options(
|
||||
self, encoder_threads: int | None = None, as_strings: bool = False
|
||||
) -> dict[str, Any]:
|
||||
"""Translate the tuning fields to codec-specific options.
|
||||
|
||||
``VideoEncoderConfig.extra_options`` are merged last but never override a structured field.
|
||||
|
||||
Args:
|
||||
encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs.
|
||||
For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``.
|
||||
For h264/hevc, this is mapped to ``threads``.
|
||||
Hardware encoders ignore this parameter.
|
||||
as_strings: If ``True``, casts values to strings.
|
||||
"""
|
||||
opts: dict[str, Any] = {}
|
||||
|
||||
def set_if(key: str, value: Any) -> None:
|
||||
if value is not None:
|
||||
opts[key] = value if not as_strings else str(value)
|
||||
|
||||
# GOP size is not a codec-specific option, so it is always set.
|
||||
set_if("g", self.g)
|
||||
|
||||
if self.vcodec == "libsvtav1":
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
svtav1_parts: list[str] = []
|
||||
if self.fast_decode is not None:
|
||||
svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}")
|
||||
if encoder_threads is not None:
|
||||
svtav1_parts.append(f"lp={encoder_threads}")
|
||||
if svtav1_parts:
|
||||
opts["svtav1-params"] = ":".join(svtav1_parts)
|
||||
elif self.vcodec in ("h264", "hevc"):
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
if self.fast_decode:
|
||||
opts["tune"] = "fastdecode"
|
||||
set_if("threads", encoder_threads)
|
||||
elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
|
||||
if self.crf is not None:
|
||||
opts["q:v"] = max(1, min(100, 100 - self.crf * 2))
|
||||
elif self.vcodec in ("h264_nvenc", "hevc_nvenc"):
|
||||
opts["rc"] = "constqp"
|
||||
set_if("qp", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
elif self.vcodec == "h264_vaapi":
|
||||
set_if("qp", self.crf)
|
||||
elif self.vcodec == "h264_qsv":
|
||||
set_if("global_quality", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
else:
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
|
||||
# Extra options are merged last but never override structured fields (values are kept as given).
|
||||
for k, v in self.extra_options.items():
|
||||
if k not in opts:
|
||||
set_if(k, v)
|
||||
|
||||
return opts
|
||||
|
||||
|
||||
def camera_encoder_defaults() -> VideoEncoderConfig:
|
||||
"""Return a :class:`VideoEncoderConfig` with RGB-camera defaults."""
|
||||
return VideoEncoderConfig()
|
||||
@@ -40,10 +40,19 @@ from .io_utils import load_episodes, write_stats
|
||||
from .lerobot_dataset import LeRobotDataset
|
||||
from .multi_dataset import MultiLeRobotDataset
|
||||
from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features
|
||||
from .pyav_utils import (
|
||||
check_video_encoder_config_pyav,
|
||||
detect_available_encoders_pyav,
|
||||
get_codec,
|
||||
)
|
||||
from .sampler import EpisodeAwareSampler
|
||||
from .streaming_dataset import StreamingLeRobotDataset
|
||||
from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card
|
||||
from .video_utils import VideoEncodingManager
|
||||
from .video_utils import (
|
||||
VideoEncoderConfig,
|
||||
VideoEncodingManager,
|
||||
camera_encoder_defaults,
|
||||
)
|
||||
|
||||
# NOTE: Low-level I/O functions (cast_stats_to_numpy, get_parquet_file_size_in_mb, etc.)
|
||||
# and legacy migration constants are intentionally NOT re-exported here.
|
||||
@@ -58,15 +67,20 @@ __all__ = [
|
||||
"LeRobotDatasetMetadata",
|
||||
"MultiLeRobotDataset",
|
||||
"StreamingLeRobotDataset",
|
||||
"VideoEncoderConfig",
|
||||
"VideoEncodingManager",
|
||||
"camera_encoder_defaults",
|
||||
"add_features",
|
||||
"aggregate_datasets",
|
||||
"aggregate_pipeline_dataset_features",
|
||||
"aggregate_stats",
|
||||
"check_video_encoder_config_pyav",
|
||||
"convert_image_to_video_dataset",
|
||||
"create_initial_features",
|
||||
"create_lerobot_dataset_card",
|
||||
"delete_episodes",
|
||||
"detect_available_encoders_pyav",
|
||||
"get_codec",
|
||||
"get_feature_stats",
|
||||
"load_episodes",
|
||||
"make_dataset",
|
||||
|
||||
@@ -332,7 +332,6 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
|
||||
videos_idx: Dictionary tracking video chunk and file indices.
|
||||
video_files_size_in_mb: Maximum size for video files in MB (defaults to DEFAULT_VIDEO_FILE_SIZE_IN_MB)
|
||||
chunk_size: Maximum number of files per chunk (defaults to DEFAULT_CHUNK_SIZE)
|
||||
|
||||
Returns:
|
||||
dict: Updated videos_idx with current chunk and file indices.
|
||||
"""
|
||||
@@ -417,6 +416,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
|
||||
concatenate_video_files(
|
||||
[dst_path, src_path],
|
||||
dst_path,
|
||||
compatibility_check=True,
|
||||
)
|
||||
# Update duration of this destination file
|
||||
dst_file_durations[dst_key] = current_dst_duration + src_duration
|
||||
|
||||
@@ -48,7 +48,7 @@ from .utils import (
|
||||
is_valid_version,
|
||||
update_chunk_file_indices,
|
||||
)
|
||||
from .video_utils import get_video_info
|
||||
from .video_utils import VideoEncoderConfig, get_video_info
|
||||
|
||||
CODEBASE_VERSION = "v3.0"
|
||||
|
||||
@@ -510,10 +510,23 @@ class LeRobotDatasetMetadata:
|
||||
self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
|
||||
write_stats(self.stats, self.root)
|
||||
|
||||
def update_video_info(self, video_key: str | None = None) -> None:
|
||||
"""
|
||||
def update_video_info(
|
||||
self,
|
||||
video_key: str | None = None,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
) -> None:
|
||||
"""Populate per-feature video info in ``info.json``.
|
||||
|
||||
Warning: this function writes info from first episode videos, implicitly assuming that all videos have
|
||||
been encoded the same way. Also, this means it assumes the first episode exists.
|
||||
|
||||
Args:
|
||||
video_key: If provided, only update this video key. Otherwise update
|
||||
all video keys in the dataset.
|
||||
camera_encoder_config: Encoder configuration used to produce the
|
||||
videos. When provided, its fields are recorded as
|
||||
``video.<field>`` entries alongside the stream-derived
|
||||
``video.*`` entries (see :func:`get_video_info`).
|
||||
"""
|
||||
if video_key is not None and video_key not in self.video_keys:
|
||||
raise ValueError(f"Video key {video_key} not found in dataset")
|
||||
@@ -522,7 +535,9 @@ class LeRobotDatasetMetadata:
|
||||
for key in video_keys:
|
||||
if not self.features[key].get("info", None):
|
||||
video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
|
||||
self.info.features[key]["info"] = get_video_info(video_path)
|
||||
self.info.features[key]["info"] = get_video_info(
|
||||
video_path, camera_encoder_config=camera_encoder_config
|
||||
)
|
||||
|
||||
def update_chunk_settings(
|
||||
self,
|
||||
|
||||
@@ -62,7 +62,12 @@ from .utils import (
|
||||
DEFAULT_EPISODES_PATH,
|
||||
update_chunk_file_indices,
|
||||
)
|
||||
from .video_utils import encode_video_frames, get_video_info
|
||||
from .video_utils import (
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
encode_video_frames,
|
||||
get_video_info,
|
||||
)
|
||||
|
||||
|
||||
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
|
||||
@@ -92,6 +97,7 @@ def delete_episodes(
|
||||
episode_indices: list[int],
|
||||
output_dir: str | Path | None = None,
|
||||
repo_id: str | None = None,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
) -> LeRobotDataset:
|
||||
"""Delete episodes from a LeRobotDataset and create a new dataset.
|
||||
|
||||
@@ -100,6 +106,8 @@ def delete_episodes(
|
||||
episode_indices: List of episode indices to delete.
|
||||
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
|
||||
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
|
||||
camera_encoder_config: Video encoder settings used when re-encoding video segments
|
||||
(``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`).
|
||||
"""
|
||||
if not episode_indices:
|
||||
raise ValueError("No episodes to delete")
|
||||
@@ -132,7 +140,7 @@ def delete_episodes(
|
||||
|
||||
video_metadata = None
|
||||
if dataset.meta.video_keys:
|
||||
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
|
||||
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder_config)
|
||||
|
||||
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
|
||||
|
||||
@@ -154,6 +162,7 @@ def split_dataset(
|
||||
dataset: LeRobotDataset,
|
||||
splits: dict[str, float | list[int]],
|
||||
output_dir: str | Path | None = None,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
) -> dict[str, LeRobotDataset]:
|
||||
"""Split a LeRobotDataset into multiple smaller datasets.
|
||||
|
||||
@@ -162,6 +171,8 @@ def split_dataset(
|
||||
splits: Either a dict mapping split names to episode indices, or a dict mapping
|
||||
split names to fractions (must sum to <= 1.0).
|
||||
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
|
||||
camera_encoder_config: Video encoder settings used when re-encoding video segments
|
||||
(``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`).
|
||||
|
||||
Examples:
|
||||
Split by specific episodes
|
||||
@@ -222,7 +233,9 @@ def split_dataset(
|
||||
|
||||
video_metadata = None
|
||||
if dataset.meta.video_keys:
|
||||
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
|
||||
video_metadata = _copy_and_reindex_videos(
|
||||
dataset, new_meta, episode_mapping, camera_encoder_config
|
||||
)
|
||||
|
||||
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
|
||||
|
||||
@@ -578,8 +591,7 @@ def _keep_episodes_from_video_with_av(
|
||||
output_path: Path,
|
||||
episodes_to_keep: list[tuple[int, int]],
|
||||
fps: float,
|
||||
vcodec: str = "libsvtav1",
|
||||
pix_fmt: str = "yuv420p",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
) -> None:
|
||||
"""Keep only specified episodes from a video file using PyAV.
|
||||
|
||||
@@ -593,9 +605,11 @@ def _keep_episodes_from_video_with_av(
|
||||
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
|
||||
is inclusive and end_frame is exclusive.
|
||||
fps: Frame rate of the video.
|
||||
vcodec: Video codec to use for encoding.
|
||||
pix_fmt: Pixel format for output video.
|
||||
camera_encoder_config: Video encoder settings
|
||||
(``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`).
|
||||
"""
|
||||
if camera_encoder_config is None:
|
||||
camera_encoder_config = camera_encoder_defaults()
|
||||
from fractions import Fraction
|
||||
|
||||
import av
|
||||
@@ -619,12 +633,12 @@ def _keep_episodes_from_video_with_av(
|
||||
|
||||
# Convert fps to Fraction for PyAV compatibility.
|
||||
fps_fraction = Fraction(fps).limit_denominator(1000)
|
||||
v_out = out.add_stream(vcodec, rate=fps_fraction)
|
||||
v_out = out.add_stream(camera_encoder_config.vcodec, rate=fps_fraction)
|
||||
|
||||
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
|
||||
v_out.width = v_in.codec_context.width
|
||||
v_out.height = v_in.codec_context.height
|
||||
v_out.pix_fmt = pix_fmt
|
||||
v_out.pix_fmt = camera_encoder_config.pix_fmt
|
||||
|
||||
# Set time_base to match the frame rate for proper timestamp handling.
|
||||
v_out.time_base = Fraction(1, int(fps))
|
||||
@@ -687,8 +701,7 @@ def _copy_and_reindex_videos(
|
||||
src_dataset: LeRobotDataset,
|
||||
dst_meta: LeRobotDatasetMetadata,
|
||||
episode_mapping: dict[int, int],
|
||||
vcodec: str = "libsvtav1",
|
||||
pix_fmt: str = "yuv420p",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
) -> dict[int, dict]:
|
||||
"""Copy and filter video files, only re-encoding files with deleted episodes.
|
||||
|
||||
@@ -700,10 +713,14 @@ def _copy_and_reindex_videos(
|
||||
src_dataset: Source dataset to copy from
|
||||
dst_meta: Destination metadata object
|
||||
episode_mapping: Mapping from old episode indices to new indices
|
||||
camera_encoder_config: Video encoder settings used when re-encoding segments
|
||||
(``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`).
|
||||
|
||||
Returns:
|
||||
dict mapping episode index to its video metadata (chunk_index, file_index, timestamps)
|
||||
"""
|
||||
if camera_encoder_config is None:
|
||||
camera_encoder_config = camera_encoder_defaults()
|
||||
if src_dataset.meta.episodes is None:
|
||||
src_dataset.meta.episodes = load_episodes(src_dataset.meta.root)
|
||||
|
||||
@@ -792,8 +809,7 @@ def _copy_and_reindex_videos(
|
||||
dst_video_path,
|
||||
episodes_to_keep_ranges,
|
||||
src_dataset.meta.fps,
|
||||
vcodec,
|
||||
pix_fmt,
|
||||
camera_encoder_config,
|
||||
)
|
||||
|
||||
cumulative_ts = 0.0
|
||||
@@ -1264,11 +1280,7 @@ def _estimate_frame_size_via_calibration(
|
||||
episode_indices: list[int],
|
||||
temp_dir: Path,
|
||||
fps: int,
|
||||
vcodec: str,
|
||||
pix_fmt: str,
|
||||
g: int,
|
||||
crf: int,
|
||||
fast_decode: int,
|
||||
camera_encoder_config: VideoEncoderConfig,
|
||||
num_calibration_frames: int = 30,
|
||||
) -> float:
|
||||
"""Estimate MB per frame by encoding a small calibration sample.
|
||||
@@ -1282,11 +1294,7 @@ def _estimate_frame_size_via_calibration(
|
||||
episode_indices: List of episode indices being processed.
|
||||
temp_dir: Temporary directory for calibration files.
|
||||
fps: Frames per second for video encoding.
|
||||
vcodec: Video codec (libsvtav1, h264, hevc).
|
||||
pix_fmt: Pixel format (yuv420p, etc.).
|
||||
g: GOP size (group of pictures).
|
||||
crf: Constant Rate Factor (quality).
|
||||
fast_decode: Fast decode tuning parameter.
|
||||
camera_encoder_config: Video encoder settings used for calibration encoding.
|
||||
num_calibration_frames: Number of frames to use for calibration (default: 30).
|
||||
|
||||
Returns:
|
||||
@@ -1322,11 +1330,7 @@ def _estimate_frame_size_via_calibration(
|
||||
imgs_dir=calibration_dir,
|
||||
video_path=calibration_video_path,
|
||||
fps=fps,
|
||||
vcodec=vcodec,
|
||||
pix_fmt=pix_fmt,
|
||||
g=g,
|
||||
crf=crf,
|
||||
fast_decode=fast_decode,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
overwrite=True,
|
||||
)
|
||||
|
||||
@@ -1644,11 +1648,7 @@ def convert_image_to_video_dataset(
|
||||
dataset: LeRobotDataset,
|
||||
output_dir: Path | None = None,
|
||||
repo_id: str | None = None,
|
||||
vcodec: str = "libsvtav1",
|
||||
pix_fmt: str = "yuv420p",
|
||||
g: int = 2,
|
||||
crf: int = 30,
|
||||
fast_decode: int = 0,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
episode_indices: list[int] | None = None,
|
||||
num_workers: int = 4,
|
||||
max_episodes_per_batch: int | None = None,
|
||||
@@ -1663,11 +1663,8 @@ def convert_image_to_video_dataset(
|
||||
dataset: The source LeRobot dataset with images
|
||||
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
|
||||
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
|
||||
vcodec: Video codec (default: libsvtav1)
|
||||
pix_fmt: Pixel format (default: yuv420p)
|
||||
g: Group of pictures size (default: 2)
|
||||
crf: Constant rate factor (default: 30)
|
||||
fast_decode: Fast decode tuning (default: 0)
|
||||
camera_encoder_config: Video encoder settings
|
||||
(``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`).
|
||||
episode_indices: List of episode indices to convert (None = all episodes)
|
||||
num_workers: Number of threads for parallel processing (default: 4)
|
||||
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
|
||||
@@ -1676,6 +1673,9 @@ def convert_image_to_video_dataset(
|
||||
Returns:
|
||||
New LeRobotDataset with images encoded as videos
|
||||
"""
|
||||
if camera_encoder_config is None:
|
||||
camera_encoder_config = camera_encoder_defaults()
|
||||
|
||||
# Check that it's an image dataset
|
||||
if len(dataset.meta.video_keys) > 0:
|
||||
raise ValueError(
|
||||
@@ -1699,7 +1699,10 @@ def convert_image_to_video_dataset(
|
||||
logging.info(
|
||||
f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}"
|
||||
)
|
||||
logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}")
|
||||
logging.info(
|
||||
f"Video codec: {camera_encoder_config.vcodec}, pixel format: {camera_encoder_config.pix_fmt}, "
|
||||
f"GOP: {camera_encoder_config.g}, CRF: {camera_encoder_config.crf}"
|
||||
)
|
||||
|
||||
# Create new features dict, converting image features to video features
|
||||
new_features = {}
|
||||
@@ -1769,11 +1772,7 @@ def convert_image_to_video_dataset(
|
||||
episode_indices=episode_indices,
|
||||
temp_dir=temp_dir,
|
||||
fps=fps,
|
||||
vcodec=vcodec,
|
||||
pix_fmt=pix_fmt,
|
||||
g=g,
|
||||
crf=crf,
|
||||
fast_decode=fast_decode,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
)
|
||||
|
||||
logging.info(f"Processing camera: {img_key}")
|
||||
@@ -1815,11 +1814,7 @@ def convert_image_to_video_dataset(
|
||||
imgs_dir=imgs_dir,
|
||||
video_path=video_path,
|
||||
fps=fps,
|
||||
vcodec=vcodec,
|
||||
pix_fmt=pix_fmt,
|
||||
g=g,
|
||||
crf=crf,
|
||||
fast_decode=fast_decode,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
overwrite=True,
|
||||
)
|
||||
|
||||
@@ -1865,7 +1860,9 @@ def convert_image_to_video_dataset(
|
||||
video_path = new_meta.root / new_meta.video_path.format(
|
||||
video_key=img_key, chunk_index=0, file_index=0
|
||||
)
|
||||
new_meta.info.features[img_key]["info"] = get_video_info(video_path)
|
||||
new_meta.info.features[img_key]["info"] = get_video_info(
|
||||
video_path, camera_encoder_config=camera_encoder_config
|
||||
)
|
||||
|
||||
write_info(new_meta.info, new_meta.root)
|
||||
|
||||
|
||||
@@ -52,6 +52,8 @@ from .utils import (
|
||||
)
|
||||
from .video_utils import (
|
||||
StreamingVideoEncoder,
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
concatenate_video_files,
|
||||
encode_video_frames,
|
||||
get_video_duration_in_s,
|
||||
@@ -65,14 +67,19 @@ def _encode_video_worker(
|
||||
episode_index: int,
|
||||
root: Path,
|
||||
fps: int,
|
||||
vcodec: str = "libsvtav1",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
encoder_threads: int | None = None,
|
||||
) -> Path:
|
||||
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
|
||||
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
|
||||
img_dir = (root / fpath).parent
|
||||
encode_video_frames(
|
||||
img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
|
||||
img_dir,
|
||||
temp_path,
|
||||
fps,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
encoder_threads=encoder_threads,
|
||||
overwrite=True,
|
||||
)
|
||||
shutil.rmtree(img_dir)
|
||||
return temp_path
|
||||
@@ -89,20 +96,22 @@ class DatasetWriter:
|
||||
self,
|
||||
meta: LeRobotDatasetMetadata,
|
||||
root: Path,
|
||||
vcodec: str,
|
||||
camera_encoder_config: VideoEncoderConfig | None,
|
||||
encoder_threads: int | None,
|
||||
batch_encoding_size: int,
|
||||
streaming_encoder: StreamingVideoEncoder | None = None,
|
||||
initial_frames: int = 0,
|
||||
):
|
||||
"""Initialize the writer with metadata, codec, and encoding config.
|
||||
"""Initialize the writer with metadata, codec, and encoder config.
|
||||
|
||||
Args:
|
||||
meta: Dataset metadata instance (used for feature schema, chunk
|
||||
settings, and episode persistence).
|
||||
root: Local dataset root directory.
|
||||
vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``).
|
||||
encoder_threads: Threads per encoder instance. ``None`` for auto.
|
||||
camera_encoder_config: Video encoder settings applied to all cameras.
|
||||
``None`` uses :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`.
|
||||
encoder_threads: Number of encoder threads (global). ``None``
|
||||
lets the codec decide.
|
||||
batch_encoding_size: Number of episodes to accumulate before
|
||||
batch-encoding videos.
|
||||
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
|
||||
@@ -111,7 +120,7 @@ class DatasetWriter:
|
||||
"""
|
||||
self._meta = meta
|
||||
self._root = root
|
||||
self._vcodec = vcodec
|
||||
self._camera_encoder_config = camera_encoder_config or camera_encoder_defaults()
|
||||
self._encoder_threads = encoder_threads
|
||||
self._batch_encoding_size = batch_encoding_size
|
||||
self._streaming_encoder = streaming_encoder
|
||||
@@ -284,7 +293,7 @@ class DatasetWriter:
|
||||
episode_index,
|
||||
self._root,
|
||||
self._meta.fps,
|
||||
self._vcodec,
|
||||
self._camera_encoder_config,
|
||||
self._encoder_threads,
|
||||
): video_key
|
||||
for video_key in self._meta.video_keys
|
||||
@@ -495,7 +504,7 @@ class DatasetWriter:
|
||||
|
||||
# Update video info (only needed when first episode is encoded)
|
||||
if episode_index == 0:
|
||||
self._meta.update_video_info(video_key)
|
||||
self._meta.update_video_info(video_key, camera_encoder_config=self._camera_encoder_config)
|
||||
write_info(self._meta.info, self._meta.root)
|
||||
|
||||
metadata = {
|
||||
@@ -564,7 +573,12 @@ class DatasetWriter:
|
||||
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
|
||||
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
|
||||
return _encode_video_worker(
|
||||
video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads
|
||||
video_key,
|
||||
episode_index,
|
||||
self._root,
|
||||
self._meta.fps,
|
||||
self._camera_encoder_config,
|
||||
self._encoder_threads,
|
||||
)
|
||||
|
||||
def close_writer(self) -> None:
|
||||
|
||||
@@ -36,8 +36,8 @@ from .utils import (
|
||||
)
|
||||
from .video_utils import (
|
||||
StreamingVideoEncoder,
|
||||
get_safe_default_codec,
|
||||
resolve_vcodec,
|
||||
VideoEncoderConfig,
|
||||
get_safe_default_video_backend,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -58,10 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
video_backend: str | None = None,
|
||||
return_uint8: bool = False,
|
||||
batch_encoding_size: int = 1,
|
||||
vcodec: str = "libsvtav1",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
encoder_threads: int | None = None,
|
||||
streaming_encoding: bool = False,
|
||||
encoder_queue_maxsize: int = 30,
|
||||
encoder_threads: int | None = None,
|
||||
):
|
||||
"""
|
||||
2 modes are available for instantiating this class, depending on 2 different use cases:
|
||||
@@ -177,16 +177,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
|
||||
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
|
||||
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
|
||||
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
|
||||
'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
|
||||
Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
|
||||
camera_encoder_config (VideoEncoderConfig | None, optional): Video encoder settings for cameras
|
||||
(codec, quality, etc.). When ``None``, :func:`~lerobot.datasets.video_utils.camera_encoder_defaults`
|
||||
is used by the writer.
|
||||
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
|
||||
codec decide.
|
||||
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
|
||||
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
|
||||
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
|
||||
streaming encoding. Defaults to 30 (~1s at 30fps).
|
||||
encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
|
||||
codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
|
||||
libsvtav1 and 'threads' for h264/hevc.
|
||||
|
||||
Note:
|
||||
Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to
|
||||
@@ -202,10 +201,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
self.episodes = episodes
|
||||
self.tolerance_s = tolerance_s
|
||||
self.revision = revision if revision else CODEBASE_VERSION
|
||||
self._video_backend = video_backend if video_backend else get_safe_default_codec()
|
||||
self._video_backend = video_backend if video_backend else get_safe_default_video_backend()
|
||||
self._return_uint8 = return_uint8
|
||||
self._batch_encoding_size = batch_encoding_size
|
||||
self._vcodec = resolve_vcodec(vcodec)
|
||||
self._encoder_threads = encoder_threads
|
||||
|
||||
if self._requested_root is not None:
|
||||
@@ -251,12 +249,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
streaming_enc = None
|
||||
if streaming_encoding and len(self.meta.video_keys) > 0:
|
||||
streaming_enc = self._build_streaming_encoder(
|
||||
self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads
|
||||
self.meta.fps,
|
||||
camera_encoder_config,
|
||||
encoder_queue_maxsize,
|
||||
encoder_threads,
|
||||
)
|
||||
self.writer = DatasetWriter(
|
||||
meta=self.meta,
|
||||
root=self.root,
|
||||
vcodec=self._vcodec,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
encoder_threads=encoder_threads,
|
||||
batch_encoding_size=batch_encoding_size,
|
||||
streaming_encoder=streaming_enc,
|
||||
@@ -298,17 +299,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
@staticmethod
|
||||
def _build_streaming_encoder(
|
||||
fps: int,
|
||||
vcodec: str,
|
||||
camera_encoder_config: VideoEncoderConfig | None,
|
||||
encoder_queue_maxsize: int,
|
||||
encoder_threads: int | None,
|
||||
) -> StreamingVideoEncoder:
|
||||
return StreamingVideoEncoder(
|
||||
fps=fps,
|
||||
vcodec=vcodec,
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
preset=None,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
queue_maxsize=encoder_queue_maxsize,
|
||||
encoder_threads=encoder_threads,
|
||||
)
|
||||
@@ -625,7 +622,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
image_writer_threads: int = 0,
|
||||
video_backend: str | None = None,
|
||||
batch_encoding_size: int = 1,
|
||||
vcodec: str = "libsvtav1",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
metadata_buffer_size: int = 10,
|
||||
streaming_encoding: bool = False,
|
||||
encoder_queue_maxsize: int = 30,
|
||||
@@ -656,20 +653,20 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
video_backend: Video decoding backend (used when reading back).
|
||||
batch_encoding_size: Number of episodes to accumulate before
|
||||
batch-encoding videos. ``1`` means encode immediately.
|
||||
vcodec: Video codec for encoding. Options include ``'libsvtav1'``,
|
||||
``'h264'``, ``'hevc'``, ``'auto'``.
|
||||
camera_encoder_config: Video encoder settings for cameras (codec, quality, etc.).
|
||||
When ``None``, :func:`~lerobot.datasets.video_utils.camera_encoder_defaults` is used.
|
||||
encoder_threads: Number of encoder threads (global). ``None``
|
||||
lets the codec decide.
|
||||
metadata_buffer_size: Number of episode metadata records to buffer
|
||||
before flushing to parquet.
|
||||
streaming_encoding: If ``True``, encode video frames in real-time
|
||||
during capture instead of writing images first.
|
||||
encoder_queue_maxsize: Max buffered frames per camera when using
|
||||
streaming encoding.
|
||||
encoder_threads: Threads per encoder instance. ``None`` for auto.
|
||||
|
||||
Returns:
|
||||
A new :class:`LeRobotDataset` in write mode.
|
||||
"""
|
||||
vcodec = resolve_vcodec(vcodec)
|
||||
obj = cls.__new__(cls)
|
||||
obj.meta = LeRobotDatasetMetadata.create(
|
||||
repo_id=repo_id,
|
||||
@@ -690,23 +687,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
obj.image_transforms = None
|
||||
obj.delta_timestamps = None
|
||||
obj.episodes = None
|
||||
obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec()
|
||||
obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend()
|
||||
obj._return_uint8 = False
|
||||
obj._batch_encoding_size = batch_encoding_size
|
||||
obj._vcodec = vcodec
|
||||
obj._encoder_threads = encoder_threads
|
||||
|
||||
# Reader is lazily created on first access (write-only mode)
|
||||
obj.reader = None
|
||||
|
||||
# Create writer
|
||||
streaming_enc = None
|
||||
if streaming_encoding and len(obj.meta.video_keys) > 0:
|
||||
streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads)
|
||||
streaming_enc = cls._build_streaming_encoder(
|
||||
fps, camera_encoder_config, encoder_queue_maxsize, encoder_threads
|
||||
)
|
||||
obj.writer = DatasetWriter(
|
||||
meta=obj.meta,
|
||||
root=obj.root,
|
||||
vcodec=vcodec,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
encoder_threads=encoder_threads,
|
||||
batch_encoding_size=batch_encoding_size,
|
||||
streaming_encoder=streaming_enc,
|
||||
@@ -729,12 +726,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
force_cache_sync: bool = False,
|
||||
video_backend: str | None = None,
|
||||
batch_encoding_size: int = 1,
|
||||
vcodec: str = "libsvtav1",
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
encoder_threads: int | None = None,
|
||||
image_writer_processes: int = 0,
|
||||
image_writer_threads: int = 0,
|
||||
streaming_encoding: bool = False,
|
||||
encoder_queue_maxsize: int = 30,
|
||||
encoder_threads: int | None = None,
|
||||
) -> "LeRobotDataset":
|
||||
"""Resume recording on an existing dataset.
|
||||
|
||||
@@ -757,13 +754,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
video_backend: Video decoding backend for reading back data.
|
||||
batch_encoding_size: Number of episodes to accumulate before
|
||||
batch-encoding videos.
|
||||
vcodec: Video codec for encoding.
|
||||
camera_encoder_config: Video encoder settings for cameras (codec, quality, etc.).
|
||||
When ``None``, :func:`~lerobot.datasets.video_utils.camera_encoder_defaults` is used.
|
||||
encoder_threads: Number of encoder threads (global). ``None``
|
||||
lets the codec decide.
|
||||
image_writer_processes: Subprocesses for async image writing.
|
||||
image_writer_threads: Threads for async image writing.
|
||||
streaming_encoding: If ``True``, encode video in real-time during
|
||||
capture.
|
||||
encoder_queue_maxsize: Max buffered frames per camera for streaming.
|
||||
encoder_threads: Threads per encoder instance. ``None`` for auto.
|
||||
|
||||
Returns:
|
||||
A :class:`LeRobotDataset` in write mode, ready to append episodes.
|
||||
@@ -774,7 +773,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
"Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt "
|
||||
"the shared cache. Please provide a local directory path."
|
||||
)
|
||||
vcodec = resolve_vcodec(vcodec)
|
||||
obj = cls.__new__(cls)
|
||||
obj.repo_id = repo_id
|
||||
obj._requested_root = Path(root)
|
||||
@@ -783,11 +781,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
obj.image_transforms = None
|
||||
obj.delta_timestamps = None
|
||||
obj.episodes = None
|
||||
obj._video_backend = video_backend if video_backend else get_safe_default_codec()
|
||||
obj._video_backend = video_backend if video_backend else get_safe_default_video_backend()
|
||||
obj._return_uint8 = False
|
||||
obj._batch_encoding_size = batch_encoding_size
|
||||
obj._vcodec = vcodec
|
||||
obj._encoder_threads = encoder_threads
|
||||
|
||||
if obj._requested_root is not None:
|
||||
obj._requested_root.mkdir(exist_ok=True, parents=True)
|
||||
@@ -796,21 +792,22 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
obj.meta = LeRobotDatasetMetadata(
|
||||
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
|
||||
)
|
||||
|
||||
obj._encoder_threads = encoder_threads
|
||||
obj.root = obj.meta.root
|
||||
|
||||
# Reader is lazily created on first access (write-only mode)
|
||||
obj.reader = None
|
||||
|
||||
# Create writer for appending
|
||||
streaming_enc = None
|
||||
if streaming_encoding and len(obj.meta.video_keys) > 0:
|
||||
streaming_enc = cls._build_streaming_encoder(
|
||||
obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads
|
||||
obj.meta.fps, camera_encoder_config, encoder_queue_maxsize, encoder_threads
|
||||
)
|
||||
obj.writer = DatasetWriter(
|
||||
meta=obj.meta,
|
||||
root=obj.root,
|
||||
vcodec=vcodec,
|
||||
camera_encoder_config=camera_encoder_config,
|
||||
encoder_threads=encoder_threads,
|
||||
batch_encoding_size=batch_encoding_size,
|
||||
streaming_encoder=streaming_enc,
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyAV-based compatibility checks for :class:`VideoEncoderConfig`.
|
||||
|
||||
Centralises all :mod:`av` introspection of the bundled FFmpeg build.
|
||||
Checks degrade to a no-op when the target codec isn't available locally.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import av
|
||||
|
||||
from lerobot.configs.video import VideoEncoderConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE")
|
||||
FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64")
|
||||
|
||||
|
||||
@functools.cache
|
||||
def get_codec(vcodec: str) -> av.codec.Codec | None:
|
||||
"""PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if unavailable."""
|
||||
try:
|
||||
return av.codec.Codec(vcodec, "w")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@functools.cache
|
||||
def _get_codec_options_by_name(vcodec: str) -> dict[str, av.option.Option]:
|
||||
"""Private-option name → PyAV ``Option`` for *vcodec* (empty if unavailable)."""
|
||||
codec = get_codec(vcodec)
|
||||
if codec is None:
|
||||
return {}
|
||||
return {opt.name: opt for opt in codec.descriptor.options}
|
||||
|
||||
|
||||
@functools.cache
|
||||
def _get_codec_video_formats(vcodec: str) -> tuple[str, ...]:
|
||||
"""Pixel formats accepted by *vcodec* in PyAV's preferred order (empty if unknown)."""
|
||||
codec = get_codec(vcodec)
|
||||
if codec is None:
|
||||
return ()
|
||||
return tuple(fmt.name for fmt in (codec.video_formats or []))
|
||||
|
||||
|
||||
def detect_available_encoders_pyav(encoders: list[str] | str) -> list[str]:
|
||||
"""Return the subset of *encoders* available as video encoders in the local FFmpeg build.
|
||||
|
||||
Each name is probed directly via :func:`get_codec`; input order is preserved.
|
||||
"""
|
||||
if isinstance(encoders, str):
|
||||
encoders = [encoders]
|
||||
|
||||
available: list[str] = []
|
||||
for name in encoders:
|
||||
codec = get_codec(name)
|
||||
if codec is not None and codec.type == "video":
|
||||
available.append(name)
|
||||
else:
|
||||
logger.debug("encoder '%s' not available as video encoder", name)
|
||||
return available
|
||||
|
||||
|
||||
def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Option) -> None:
|
||||
"""Range-check numeric *value* and choice-check string *value* against *opt*."""
|
||||
type_name = opt.type.name
|
||||
if type_name in FFMPEG_NUMERIC_OPTION_TYPES:
|
||||
if isinstance(value, bool):
|
||||
raise ValueError(
|
||||
f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
|
||||
)
|
||||
elif isinstance(value, str):
|
||||
try:
|
||||
num_val = float(value)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
|
||||
) from e
|
||||
elif isinstance(value, (float, int)):
|
||||
num_val = value
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
|
||||
)
|
||||
|
||||
# Check integer type compatibility
|
||||
if type_name in FFMPEG_INTEGER_OPTION_TYPES and not num_val.is_integer():
|
||||
raise ValueError(
|
||||
f"{label}={num_val!r} must be an integer for codec {vcodec!r} "
|
||||
f"(FFmpeg option {opt.name!r} is {type_name}); float values are not allowed."
|
||||
)
|
||||
|
||||
# Check numeric range compatibility
|
||||
lo, hi = float(opt.min), float(opt.max)
|
||||
if lo < hi and not (lo <= num_val <= hi):
|
||||
raise ValueError(
|
||||
f"{label}={num_val} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]"
|
||||
)
|
||||
|
||||
elif type_name == "STRING":
|
||||
if isinstance(value, bool):
|
||||
raise ValueError(f"{label}={value!r} is not a valid string value for codec {vcodec!r}.")
|
||||
if isinstance(value, str):
|
||||
str_val = value
|
||||
elif isinstance(value, (int, float)):
|
||||
str_val = str(value)
|
||||
else:
|
||||
raise ValueError(f"{label}={value!r} has unsupported type for STRING option on codec {vcodec!r}")
|
||||
|
||||
# Check string choice compatibility
|
||||
choices = [c.name for c in (opt.choices or [])]
|
||||
if choices and str_val not in choices:
|
||||
raise ValueError(
|
||||
f"{label}={str_val!r} is not a supported choice for codec "
|
||||
f"{vcodec!r}; valid choices: {choices}"
|
||||
)
|
||||
else:
|
||||
return
|
||||
|
||||
|
||||
def _check_pixel_format(vcodec: str, pix_fmt: str) -> None:
|
||||
formats = _get_codec_video_formats(vcodec)
|
||||
if formats and pix_fmt not in formats:
|
||||
raise ValueError(
|
||||
f"pix_fmt={pix_fmt!r} is not supported by codec {vcodec!r}; "
|
||||
f"supported pixel formats: {list(formats)}"
|
||||
)
|
||||
|
||||
|
||||
def _check_codec_options(vcodec: str, codec_options: dict[str, Any], config: VideoEncoderConfig) -> None:
|
||||
"""Validate merged encoder options (typed) against the codec's published AVOptions."""
|
||||
supported_options = _get_codec_options_by_name(vcodec)
|
||||
for key, value in codec_options.items():
|
||||
# GOP size is not a codec-specific option, it has to be validated separately.
|
||||
if key == "g":
|
||||
if isinstance(value, bool) or not isinstance(value, int) or value < 1:
|
||||
raise ValueError(f"g={value!r} must be a positive integer for codec {vcodec!r}")
|
||||
continue
|
||||
if key not in supported_options:
|
||||
continue
|
||||
opt = supported_options[key]
|
||||
label = f"extra_options[{key!r}]" if key in config.extra_options else key
|
||||
_check_option_value(vcodec, label, value, opt)
|
||||
|
||||
|
||||
def check_video_encoder_config_pyav(config: VideoEncoderConfig) -> None:
|
||||
"""Verify *config* is compatible with the bundled FFmpeg build.
|
||||
|
||||
Checks pixel format, abstract tuning-field compatibility, and each merged
|
||||
encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options`
|
||||
against PyAV (including numeric ``extra_options`` present in that dict).
|
||||
No-op when ``config.vcodec`` isn't in the local FFmpeg build.
|
||||
|
||||
Raises:
|
||||
ValueError: on the first incompatibility encountered.
|
||||
"""
|
||||
vcodec = config.vcodec
|
||||
options = _get_codec_options_by_name(vcodec)
|
||||
if not options:
|
||||
raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build")
|
||||
_check_pixel_format(config.vcodec, config.pix_fmt)
|
||||
_check_codec_options(config.vcodec, config.get_codec_options(), config)
|
||||
@@ -22,7 +22,7 @@ import shutil
|
||||
import tempfile
|
||||
import threading
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from fractions import Fraction
|
||||
from pathlib import Path
|
||||
from threading import Lock
|
||||
@@ -37,86 +37,14 @@ import torchvision
|
||||
from datasets.features.features import register_feature
|
||||
from PIL import Image
|
||||
|
||||
from lerobot.utils.import_utils import get_safe_default_codec
|
||||
from lerobot.configs.video import (
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
)
|
||||
from lerobot.utils.import_utils import get_safe_default_video_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build.
|
||||
# Determines the order of preference for auto-selection when vcodec="auto" is used.
|
||||
HW_ENCODERS = [
|
||||
"h264_videotoolbox", # macOS
|
||||
"hevc_videotoolbox", # macOS
|
||||
"h264_nvenc", # NVIDIA GPU
|
||||
"hevc_nvenc", # NVIDIA GPU
|
||||
"h264_vaapi", # Linux Intel/AMD
|
||||
"h264_qsv", # Intel Quick Sync
|
||||
]
|
||||
|
||||
VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS)
|
||||
|
||||
|
||||
def _get_codec_options(
|
||||
vcodec: str,
|
||||
g: int | None = 2,
|
||||
crf: int | None = 30,
|
||||
preset: int | None = None,
|
||||
) -> dict:
|
||||
"""Build codec-specific options dict for video encoding."""
|
||||
options = {}
|
||||
|
||||
# GOP size (keyframe interval) - supported by VideoToolbox and software encoders
|
||||
if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS):
|
||||
options["g"] = str(g)
|
||||
|
||||
# Quality control (codec-specific parameter names)
|
||||
if crf is not None:
|
||||
if vcodec in ("h264", "hevc", "libsvtav1"):
|
||||
options["crf"] = str(crf)
|
||||
elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
|
||||
quality = max(1, min(100, int(100 - crf * 2)))
|
||||
options["q:v"] = str(quality)
|
||||
elif vcodec in ("h264_nvenc", "hevc_nvenc"):
|
||||
options["rc"] = "constqp"
|
||||
options["qp"] = str(crf)
|
||||
elif vcodec in ("h264_vaapi",):
|
||||
options["qp"] = str(crf)
|
||||
elif vcodec in ("h264_qsv",):
|
||||
options["global_quality"] = str(crf)
|
||||
|
||||
# Preset (only for libsvtav1)
|
||||
if vcodec == "libsvtav1":
|
||||
options["preset"] = str(preset) if preset is not None else "12"
|
||||
|
||||
return options
|
||||
|
||||
|
||||
def detect_available_hw_encoders() -> list[str]:
|
||||
"""Probe PyAV/FFmpeg for available hardware video encoders."""
|
||||
available = []
|
||||
for codec_name in HW_ENCODERS:
|
||||
try:
|
||||
av.codec.Codec(codec_name, "w")
|
||||
available.append(codec_name)
|
||||
except Exception: # nosec B110
|
||||
logger.debug("HW encoder '%s' not available", codec_name) # nosec B110
|
||||
return available
|
||||
|
||||
|
||||
def resolve_vcodec(vcodec: str) -> str:
|
||||
"""Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1."""
|
||||
if vcodec not in VALID_VIDEO_CODECS:
|
||||
raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
|
||||
if vcodec != "auto":
|
||||
logger.info(f"Using video codec: {vcodec}")
|
||||
return vcodec
|
||||
available = detect_available_hw_encoders()
|
||||
for encoder in HW_ENCODERS:
|
||||
if encoder in available:
|
||||
logger.info(f"Auto-selected video codec: {encoder}")
|
||||
return encoder
|
||||
logger.info("No hardware encoder available, falling back to software encoder 'libsvtav1'")
|
||||
return "libsvtav1"
|
||||
|
||||
|
||||
def decode_video_frames(
|
||||
video_path: Path | str,
|
||||
@@ -142,7 +70,7 @@ def decode_video_frames(
|
||||
Currently supports torchcodec on cpu and pyav.
|
||||
"""
|
||||
if backend is None:
|
||||
backend = get_safe_default_codec()
|
||||
backend = get_safe_default_video_backend()
|
||||
if backend == "torchcodec":
|
||||
return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8)
|
||||
elif backend in ["pyav", "video_reader"]:
|
||||
@@ -400,18 +328,17 @@ def encode_video_frames(
|
||||
imgs_dir: Path | str,
|
||||
video_path: Path | str,
|
||||
fps: int,
|
||||
vcodec: str = "libsvtav1",
|
||||
pix_fmt: str = "yuv420p",
|
||||
g: int | None = 2,
|
||||
crf: int | None = 30,
|
||||
fast_decode: int = 0,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
encoder_threads: int | None = None,
|
||||
*,
|
||||
log_level: int | None = av.logging.WARNING,
|
||||
overwrite: bool = False,
|
||||
preset: int | None = None,
|
||||
encoder_threads: int | None = None,
|
||||
) -> None:
|
||||
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
|
||||
vcodec = resolve_vcodec(vcodec)
|
||||
if camera_encoder_config is None:
|
||||
camera_encoder_config = camera_encoder_defaults()
|
||||
vcodec = camera_encoder_config.vcodec
|
||||
pix_fmt = camera_encoder_config.pix_fmt
|
||||
|
||||
video_path = Path(video_path)
|
||||
imgs_dir = Path(imgs_dir)
|
||||
@@ -422,42 +349,18 @@ def encode_video_frames(
|
||||
|
||||
video_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Encoders/pixel formats incompatibility check
|
||||
if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
|
||||
logger.warning(
|
||||
f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
|
||||
)
|
||||
pix_fmt = "yuv420p"
|
||||
|
||||
# Get input frames
|
||||
template = "frame-" + ("[0-9]" * 6) + ".png"
|
||||
input_list = sorted(
|
||||
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0])
|
||||
)
|
||||
|
||||
# Define video output frame size (assuming all input frames are the same size)
|
||||
if len(input_list) == 0:
|
||||
raise FileNotFoundError(f"No images found in {imgs_dir}.")
|
||||
with Image.open(input_list[0]) as dummy_image:
|
||||
width, height = dummy_image.size
|
||||
|
||||
# Define video codec options
|
||||
video_options = _get_codec_options(vcodec, g, crf, preset)
|
||||
|
||||
if fast_decode:
|
||||
key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
|
||||
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
|
||||
video_options[key] = value
|
||||
|
||||
if encoder_threads is not None:
|
||||
if vcodec == "libsvtav1":
|
||||
lp_param = f"lp={encoder_threads}"
|
||||
if "svtav1-params" in video_options:
|
||||
video_options["svtav1-params"] += f":{lp_param}"
|
||||
else:
|
||||
video_options["svtav1-params"] = lp_param
|
||||
else:
|
||||
video_options["threads"] = str(encoder_threads)
|
||||
video_options = camera_encoder_config.get_codec_options(encoder_threads, as_strings=True)
|
||||
|
||||
# Set logging level
|
||||
if log_level is not None:
|
||||
@@ -494,7 +397,10 @@ def encode_video_frames(
|
||||
|
||||
|
||||
def concatenate_video_files(
|
||||
input_video_paths: list[Path | str], output_video_path: Path, overwrite: bool = True
|
||||
input_video_paths: list[Path | str],
|
||||
output_video_path: Path,
|
||||
overwrite: bool = True,
|
||||
compatibility_check: bool = False,
|
||||
):
|
||||
"""
|
||||
Concatenate multiple video files into a single video file using pyav.
|
||||
@@ -507,6 +413,7 @@ def concatenate_video_files(
|
||||
input_video_paths: Ordered list of input video file paths to concatenate.
|
||||
output_video_path: Path to the output video file.
|
||||
overwrite: Whether to overwrite the output video file if it already exists. Default is True.
|
||||
compatibility_check: Whether to check if the input videos are compatible. Default is False.
|
||||
|
||||
Note:
|
||||
- Creates a temporary directory for intermediate files that is cleaned up after use.
|
||||
@@ -525,6 +432,22 @@ def concatenate_video_files(
|
||||
if len(input_video_paths) == 0:
|
||||
raise FileNotFoundError("No input video paths provided.")
|
||||
|
||||
# This check may be skipped at recording time as videos are encoded with the same encoder config.
|
||||
if compatibility_check:
|
||||
reference_video_info = get_video_info(input_video_paths[0])
|
||||
for input_path in input_video_paths[1:]:
|
||||
video_info = get_video_info(input_path)
|
||||
if (
|
||||
video_info["video.height"] != reference_video_info["video.height"]
|
||||
or video_info["video.width"] != reference_video_info["video.width"]
|
||||
or video_info["video.fps"] != reference_video_info["video.fps"]
|
||||
or video_info["video.codec"] != reference_video_info["video.codec"]
|
||||
or video_info["video.pix_fmt"] != reference_video_info["video.pix_fmt"]
|
||||
):
|
||||
raise ValueError(
|
||||
f"Input video {input_path} is not compatible with the reference video {input_video_paths[0]}."
|
||||
)
|
||||
|
||||
# Create a temporary .ffconcat file to list the input video paths
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".ffconcat", delete=False) as tmp_concatenate_file:
|
||||
tmp_concatenate_file.write("ffconcat version 1.0\n")
|
||||
@@ -591,26 +514,20 @@ class _CameraEncoderThread(threading.Thread):
|
||||
fps: int,
|
||||
vcodec: str,
|
||||
pix_fmt: str,
|
||||
g: int | None,
|
||||
crf: int | None,
|
||||
preset: int | None,
|
||||
codec_options: dict[str, str],
|
||||
frame_queue: queue.Queue,
|
||||
result_queue: queue.Queue,
|
||||
stop_event: threading.Event,
|
||||
encoder_threads: int | None = None,
|
||||
):
|
||||
super().__init__(daemon=True)
|
||||
self.video_path = video_path
|
||||
self.fps = fps
|
||||
self.vcodec = vcodec
|
||||
self.pix_fmt = pix_fmt
|
||||
self.g = g
|
||||
self.crf = crf
|
||||
self.preset = preset
|
||||
self.codec_options = codec_options
|
||||
self.frame_queue = frame_queue
|
||||
self.result_queue = result_queue
|
||||
self.stop_event = stop_event
|
||||
self.encoder_threads = encoder_threads
|
||||
|
||||
def run(self) -> None:
|
||||
from .compute_stats import RunningQuantileStats, auto_downsample_height_width
|
||||
@@ -646,19 +563,9 @@ class _CameraEncoderThread(threading.Thread):
|
||||
# Open container on first frame (to get width/height)
|
||||
if container is None:
|
||||
height, width = frame_data.shape[:2]
|
||||
video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset)
|
||||
if self.encoder_threads is not None:
|
||||
if self.vcodec == "libsvtav1":
|
||||
lp_param = f"lp={self.encoder_threads}"
|
||||
if "svtav1-params" in video_options:
|
||||
video_options["svtav1-params"] += f":{lp_param}"
|
||||
else:
|
||||
video_options["svtav1-params"] = lp_param
|
||||
else:
|
||||
video_options["threads"] = str(self.encoder_threads)
|
||||
Path(self.video_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
container = av.open(str(self.video_path), "w")
|
||||
output_stream = container.add_stream(self.vcodec, self.fps, options=video_options)
|
||||
output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options)
|
||||
output_stream.pix_fmt = self.pix_fmt
|
||||
output_stream.width = width
|
||||
output_stream.height = height
|
||||
@@ -724,22 +631,24 @@ class StreamingVideoEncoder:
|
||||
def __init__(
|
||||
self,
|
||||
fps: int,
|
||||
vcodec: str = "libsvtav1",
|
||||
pix_fmt: str = "yuv420p",
|
||||
g: int | None = 2,
|
||||
crf: int | None = 30,
|
||||
preset: int | None = None,
|
||||
camera_encoder_config: VideoEncoderConfig | None = None,
|
||||
queue_maxsize: int = 30,
|
||||
encoder_threads: int | None = None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
fps: Frames per second for the output videos.
|
||||
camera_encoder_config: Video encoder settings applied to all cameras.
|
||||
When ``None``, :func:`camera_encoder_defaults` is used.
|
||||
encoder_threads: Number of encoder threads (global setting).
|
||||
``None`` lets the codec decide.
|
||||
queue_maxsize: Max frames to buffer per camera before
|
||||
back-pressure drops frames.
|
||||
"""
|
||||
self.fps = fps
|
||||
self.vcodec = resolve_vcodec(vcodec)
|
||||
self.pix_fmt = pix_fmt
|
||||
self.g = g
|
||||
self.crf = crf
|
||||
self.preset = preset
|
||||
self._camera_encoder_config = camera_encoder_config or camera_encoder_defaults()
|
||||
self._encoder_threads = encoder_threads
|
||||
self.queue_maxsize = queue_maxsize
|
||||
self.encoder_threads = encoder_threads
|
||||
|
||||
self._frame_queues: dict[str, queue.Queue] = {}
|
||||
self._result_queues: dict[str, queue.Queue] = {}
|
||||
@@ -770,18 +679,19 @@ class StreamingVideoEncoder:
|
||||
temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
|
||||
video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"
|
||||
|
||||
vcodec = self._camera_encoder_config.vcodec
|
||||
codec_options = self._camera_encoder_config.get_codec_options(
|
||||
self._encoder_threads, as_strings=True
|
||||
)
|
||||
encoder_thread = _CameraEncoderThread(
|
||||
video_path=video_path,
|
||||
fps=self.fps,
|
||||
vcodec=self.vcodec,
|
||||
pix_fmt=self.pix_fmt,
|
||||
g=self.g,
|
||||
crf=self.crf,
|
||||
preset=self.preset,
|
||||
vcodec=vcodec,
|
||||
pix_fmt=self._camera_encoder_config.pix_fmt,
|
||||
codec_options=codec_options,
|
||||
frame_queue=frame_queue,
|
||||
result_queue=result_queue,
|
||||
stop_event=stop_event,
|
||||
encoder_threads=self.encoder_threads,
|
||||
)
|
||||
encoder_thread.start()
|
||||
|
||||
@@ -986,8 +896,18 @@ def get_audio_info(video_path: Path | str) -> dict:
|
||||
return audio_info
|
||||
|
||||
|
||||
def get_video_info(video_path: Path | str) -> dict:
|
||||
# Set logging level
|
||||
def get_video_info(
|
||||
video_path: Path | str,
|
||||
camera_encoder_config: "VideoEncoderConfig | None" = None,
|
||||
) -> dict:
|
||||
"""Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``.
|
||||
|
||||
Args:
|
||||
video_path: Path to the encoded video file to probe.
|
||||
camera_encoder_config: If provided, record the exact encoder settings used to encode this
|
||||
video. Stream-derived values take precedence — encoder fields are only written for keys
|
||||
not already populated from the video file itself.
|
||||
"""
|
||||
logging.getLogger("libav").setLevel(av.logging.WARNING)
|
||||
|
||||
# Getting video stream information
|
||||
@@ -1018,6 +938,14 @@ def get_video_info(video_path: Path | str) -> dict:
|
||||
# Adding audio stream information
|
||||
video_info.update(**get_audio_info(video_path))
|
||||
|
||||
# Add additional encoder configuration if provided
|
||||
if camera_encoder_config is not None:
|
||||
for field_name, field_value in asdict(camera_encoder_config).items():
|
||||
# vcodec is already populated from the video stream
|
||||
if field_name == "vcodec":
|
||||
continue
|
||||
video_info.setdefault(f"video.{field_name}", field_value)
|
||||
|
||||
return video_info
|
||||
|
||||
|
||||
|
||||
@@ -332,7 +332,7 @@ def build_rollout_context(
|
||||
cfg.dataset.repo_id,
|
||||
root=cfg.dataset.root,
|
||||
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
|
||||
vcodec=cfg.dataset.vcodec,
|
||||
camera_encoder_config=cfg.dataset.camera_encoder_config,
|
||||
streaming_encoding=cfg.dataset.streaming_encoding,
|
||||
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
@@ -367,7 +367,7 @@ def build_rollout_context(
|
||||
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera
|
||||
* len(robot.cameras if hasattr(robot, "cameras") else []),
|
||||
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
|
||||
vcodec=cfg.dataset.vcodec,
|
||||
camera_encoder_config=cfg.dataset.camera_encoder_config,
|
||||
streaming_encoding=cfg.dataset.streaming_encoding,
|
||||
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
|
||||
@@ -49,6 +49,14 @@ Delete episodes and save to a new dataset at a specific path and with a new repo
|
||||
--operation.type delete_episodes \
|
||||
--operation.episode_indices "[0, 2, 5]"
|
||||
|
||||
Delete episodes and re-encode video segments with h264:
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht \
|
||||
--operation.type delete_episodes \
|
||||
--operation.episode_indices "[0, 2, 5]" \
|
||||
--operation.camera_encoder_config.vcodec h264 \
|
||||
--operation.camera_encoder_config.crf 23
|
||||
|
||||
Split dataset by fractions (pusht_train, pusht_val):
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht \
|
||||
@@ -74,6 +82,14 @@ Split into more than two splits:
|
||||
--operation.type split \
|
||||
--operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'
|
||||
|
||||
Split dataset and re-encode video segments with h264:
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht \
|
||||
--operation.type split \
|
||||
--operation.splits '{"train": 0.8, "val": 0.2}' \
|
||||
--operation.camera_encoder_config.vcodec h264 \
|
||||
--operation.camera_encoder_config.crf 23
|
||||
|
||||
Merge multiple datasets:
|
||||
lerobot-edit-dataset \
|
||||
--new_repo_id lerobot/pusht_merged \
|
||||
@@ -187,7 +203,7 @@ import abc
|
||||
import logging
|
||||
import shutil
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import draccus
|
||||
@@ -195,6 +211,8 @@ import draccus
|
||||
from lerobot.configs import parser
|
||||
from lerobot.datasets import (
|
||||
LeRobotDataset,
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
convert_image_to_video_dataset,
|
||||
delete_episodes,
|
||||
merge_datasets,
|
||||
@@ -218,12 +236,14 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC):
|
||||
@dataclass
|
||||
class DeleteEpisodesConfig(OperationConfig):
|
||||
episode_indices: list[int] | None = None
|
||||
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
|
||||
|
||||
@OperationConfig.register_subclass("split")
|
||||
@dataclass
|
||||
class SplitConfig(OperationConfig):
|
||||
splits: dict[str, float | list[int]] | None = None
|
||||
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
|
||||
|
||||
@OperationConfig.register_subclass("merge")
|
||||
@@ -250,11 +270,7 @@ class ModifyTasksConfig(OperationConfig):
|
||||
@dataclass
|
||||
class ConvertImageToVideoConfig(OperationConfig):
|
||||
output_dir: str | None = None
|
||||
vcodec: str = "libsvtav1"
|
||||
pix_fmt: str = "yuv420p"
|
||||
g: int = 2
|
||||
crf: int = 30
|
||||
fast_decode: int = 0
|
||||
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
episode_indices: list[int] | None = None
|
||||
num_workers: int = 4
|
||||
max_episodes_per_batch: int | None = None
|
||||
@@ -356,6 +372,7 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
|
||||
episode_indices=cfg.operation.episode_indices,
|
||||
output_dir=output_dir,
|
||||
repo_id=output_repo_id,
|
||||
camera_encoder_config=cfg.operation.camera_encoder_config,
|
||||
)
|
||||
|
||||
logging.info(f"Dataset saved to {output_dir}")
|
||||
@@ -387,6 +404,7 @@ def handle_split(cfg: EditDatasetConfig) -> None:
|
||||
dataset,
|
||||
splits=cfg.operation.splits,
|
||||
output_dir=cfg.new_root,
|
||||
camera_encoder_config=cfg.operation.camera_encoder_config,
|
||||
)
|
||||
|
||||
for split_name, split_ds in split_datasets.items():
|
||||
@@ -557,11 +575,8 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
|
||||
dataset=dataset,
|
||||
output_dir=output_dir,
|
||||
repo_id=output_repo_id,
|
||||
vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"),
|
||||
pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"),
|
||||
g=getattr(cfg.operation, "g", 2),
|
||||
crf=getattr(cfg.operation, "crf", 30),
|
||||
fast_decode=getattr(cfg.operation, "fast_decode", 0),
|
||||
camera_encoder_config=getattr(cfg.operation, "camera_encoder_config", None)
|
||||
or camera_encoder_defaults(),
|
||||
episode_indices=getattr(cfg.operation, "episode_indices", None),
|
||||
num_workers=getattr(cfg.operation, "num_workers", 4),
|
||||
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
|
||||
|
||||
@@ -63,6 +63,27 @@ lerobot-record \\
|
||||
--dataset.streaming_encoding=true \\
|
||||
--dataset.encoder_threads=2
|
||||
```
|
||||
|
||||
Example recording with custom video encoding parameters:
|
||||
```shell
|
||||
lerobot-record \\
|
||||
--robot.type=so100_follower \\
|
||||
--robot.port=/dev/tty.usbmodem58760431541 \\
|
||||
--robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\
|
||||
--robot.id=black \\
|
||||
--teleop.type=so100_leader \\
|
||||
--teleop.port=/dev/tty.usbmodem58760431551 \\
|
||||
--teleop.id=blue \\
|
||||
--dataset.repo_id=<my_username>/<my_dataset_name> \\
|
||||
--dataset.num_episodes=2 \\
|
||||
--dataset.single_task="Grab the cube" \\
|
||||
--dataset.streaming_encoding=true \\
|
||||
--dataset.encoder_threads=2 \\
|
||||
--dataset.camera_encoder_config.vcodec=h264 \\
|
||||
--dataset.camera_encoder_config.preset=fast \\
|
||||
--dataset.camera_encoder_config.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \\
|
||||
--display_data=true
|
||||
```
|
||||
"""
|
||||
|
||||
import logging
|
||||
@@ -377,10 +398,10 @@ def record(
|
||||
cfg.dataset.repo_id,
|
||||
root=cfg.dataset.root,
|
||||
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
|
||||
vcodec=cfg.dataset.vcodec,
|
||||
camera_encoder_config=cfg.dataset.camera_encoder_config,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
streaming_encoding=cfg.dataset.streaming_encoding,
|
||||
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
|
||||
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
|
||||
if num_cameras > 0
|
||||
@@ -406,10 +427,10 @@ def record(
|
||||
image_writer_processes=cfg.dataset.num_image_writer_processes,
|
||||
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
|
||||
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
|
||||
vcodec=cfg.dataset.vcodec,
|
||||
camera_encoder_config=cfg.dataset.camera_encoder_config,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
streaming_encoding=cfg.dataset.streaming_encoding,
|
||||
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
)
|
||||
|
||||
robot.connect()
|
||||
@@ -420,7 +441,7 @@ def record(
|
||||
|
||||
if not cfg.dataset.streaming_encoding:
|
||||
logging.info(
|
||||
"Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
|
||||
"Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder_config.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
|
||||
)
|
||||
|
||||
with VideoEncodingManager(dataset):
|
||||
|
||||
@@ -69,7 +69,7 @@ def is_package_available(
|
||||
return package_exists
|
||||
|
||||
|
||||
def get_safe_default_codec():
|
||||
def get_safe_default_video_backend():
|
||||
logger = logging.getLogger(__name__)
|
||||
if importlib.util.find_spec("torchcodec"):
|
||||
return "torchcodec"
|
||||
@@ -128,6 +128,9 @@ _hidapi_available = is_package_available("hidapi", import_name="hid")
|
||||
_pandas_available = is_package_available("pandas")
|
||||
_faker_available = is_package_available("faker")
|
||||
|
||||
# Video encoding / decoding
|
||||
_av_available = is_package_available("av")
|
||||
|
||||
# Misc
|
||||
_pynput_available = is_package_available("pynput")
|
||||
_pygame_available = is_package_available("pygame")
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2191cd86e9e32ecbe18e33ad68d49060e479723ab5a3212bbb26df3025ccb568
|
||||
size 5815
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3e0ebf563ba3ed9c24b691a0f0b29e0294a1fa9b51422e1ece296155f1465768
|
||||
size 16236
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8475bfd5e6c4c780df46200e2b027e262b38436c57d01078bd943a5b87c65b8f
|
||||
size 20726
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6434322d1c671a7d132367619f841a775317cb9ff973f3f4505831e3ed74076d
|
||||
size 23808
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8efc84375e92a3499cef93100e04d8fb354670f3d9e0db2097b52575927284fc
|
||||
size 12237
|
||||
@@ -20,7 +20,7 @@ import pytest
|
||||
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
|
||||
|
||||
from lerobot.datasets.dataset_reader import DatasetReader
|
||||
from lerobot.utils.import_utils import get_safe_default_codec
|
||||
from lerobot.utils.import_utils import get_safe_default_video_backend
|
||||
|
||||
# ── Loading ──────────────────────────────────────────────────────────
|
||||
|
||||
@@ -35,7 +35,7 @@ def test_try_load_returns_true_when_data_exists(tmp_path, lerobot_dataset_factor
|
||||
root=dataset.root,
|
||||
episodes=None,
|
||||
tolerance_s=1e-4,
|
||||
video_backend=get_safe_default_codec(),
|
||||
video_backend=get_safe_default_video_backend(),
|
||||
delta_timestamps=None,
|
||||
image_transforms=None,
|
||||
)
|
||||
@@ -58,7 +58,7 @@ def test_try_load_returns_false_when_no_data(tmp_path):
|
||||
root=meta.root,
|
||||
episodes=None,
|
||||
tolerance_s=1e-4,
|
||||
video_backend=get_safe_default_codec(),
|
||||
video_backend=get_safe_default_video_backend(),
|
||||
delta_timestamps=None,
|
||||
image_transforms=None,
|
||||
)
|
||||
|
||||
@@ -25,6 +25,7 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da
|
||||
|
||||
from lerobot.datasets.dataset_tools import (
|
||||
add_features,
|
||||
convert_image_to_video_dataset,
|
||||
delete_episodes,
|
||||
merge_datasets,
|
||||
modify_features,
|
||||
@@ -32,7 +33,7 @@ from lerobot.datasets.dataset_tools import (
|
||||
remove_feature,
|
||||
split_dataset,
|
||||
)
|
||||
from lerobot.scripts.lerobot_edit_dataset import convert_image_to_video_dataset
|
||||
from lerobot.datasets.video_utils import VideoEncoderConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -1246,10 +1247,12 @@ def test_convert_image_to_video_dataset(tmp_path):
|
||||
dataset=source_dataset,
|
||||
output_dir=output_dir,
|
||||
repo_id="lerobot/pusht_video",
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
camera_encoder_config=VideoEncoderConfig(
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
),
|
||||
episode_indices=[0, 1],
|
||||
num_workers=2,
|
||||
)
|
||||
|
||||
@@ -28,6 +28,7 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da
|
||||
from lerobot.datasets.dataset_writer import _encode_video_worker
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.utils import DEFAULT_IMAGE_PATH
|
||||
from lerobot.datasets.video_utils import VideoEncoderConfig
|
||||
from tests.fixtures.constants import DEFAULT_FPS, DUMMY_REPO_ID
|
||||
|
||||
SIMPLE_FEATURES = {
|
||||
@@ -52,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict:
|
||||
# ── Existing encode_video_worker tests ───────────────────────────────
|
||||
|
||||
|
||||
def test_encode_video_worker_forwards_vcodec(tmp_path):
|
||||
"""_encode_video_worker correctly forwards the vcodec parameter."""
|
||||
def test_encode_video_worker_forwards_camera_encoder_config(tmp_path):
|
||||
"""_encode_video_worker forwards camera_encoder_config to encode_video_frames."""
|
||||
video_key = "observation.images.laptop"
|
||||
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
|
||||
img_dir = tmp_path / Path(fpath).parent
|
||||
@@ -68,13 +69,21 @@ def test_encode_video_worker_forwards_vcodec(tmp_path):
|
||||
Path(video_path).touch()
|
||||
|
||||
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
|
||||
_encode_video_worker(video_key, 0, tmp_path, fps=30, vcodec="h264")
|
||||
_encode_video_worker(
|
||||
video_key,
|
||||
0,
|
||||
tmp_path,
|
||||
fps=30,
|
||||
camera_encoder_config=VideoEncoderConfig(vcodec="h264", preset=None),
|
||||
encoder_threads=4,
|
||||
)
|
||||
|
||||
assert captured_kwargs["vcodec"] == "h264"
|
||||
assert captured_kwargs["camera_encoder_config"].vcodec == "h264"
|
||||
assert captured_kwargs["encoder_threads"] == 4
|
||||
|
||||
|
||||
def test_encode_video_worker_default_vcodec(tmp_path):
|
||||
"""_encode_video_worker uses libsvtav1 as the default codec."""
|
||||
def test_encode_video_worker_default_camera_encoder_config(tmp_path):
|
||||
"""_encode_video_worker passes None camera_encoder_config which encode_video_frames defaults."""
|
||||
video_key = "observation.images.laptop"
|
||||
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
|
||||
img_dir = tmp_path / Path(fpath).parent
|
||||
@@ -91,7 +100,8 @@ def test_encode_video_worker_default_vcodec(tmp_path):
|
||||
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
|
||||
_encode_video_worker(video_key, 0, tmp_path, fps=30)
|
||||
|
||||
assert captured_kwargs["vcodec"] == "libsvtav1"
|
||||
assert captured_kwargs["camera_encoder_config"] is None
|
||||
assert captured_kwargs["encoder_threads"] is None
|
||||
|
||||
|
||||
# ── add_frame contracts ──────────────────────────────────────────────
|
||||
|
||||
@@ -31,6 +31,7 @@ from torchvision.transforms import v2
|
||||
|
||||
from lerobot.configs.default import DatasetConfig
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
from lerobot.configs.video import VALID_VIDEO_CODECS, VideoEncoderConfig
|
||||
from lerobot.datasets import make_dataset
|
||||
from lerobot.datasets.feature_utils import get_hf_features_from_features
|
||||
from lerobot.datasets.image_writer import image_array_to_pil_image
|
||||
@@ -43,7 +44,6 @@ from lerobot.datasets.utils import (
|
||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||
create_branch,
|
||||
)
|
||||
from lerobot.datasets.video_utils import VALID_VIDEO_CODECS
|
||||
from lerobot.envs.factory import make_env_config
|
||||
from lerobot.policies.factory import make_policy_config
|
||||
from lerobot.robots import make_robot_from_config
|
||||
@@ -1470,17 +1470,9 @@ def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_fact
|
||||
|
||||
|
||||
def test_lerobot_dataset_vcodec_validation():
|
||||
"""Test that LeRobotDataset validates the vcodec parameter."""
|
||||
# Test that invalid vcodec raises ValueError
|
||||
"""Invalid vcodec in encoder config is rejected at construction time."""
|
||||
with pytest.raises(ValueError, match="Invalid vcodec"):
|
||||
LeRobotDataset.__new__(LeRobotDataset) # bypass __init__ to test validation directly
|
||||
# Actually test via create since it's easier
|
||||
LeRobotDataset.create(
|
||||
repo_id="test/invalid_codec",
|
||||
fps=30,
|
||||
features={"observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}},
|
||||
vcodec="invalid_codec",
|
||||
)
|
||||
VideoEncoderConfig(vcodec="invalid_codec")
|
||||
|
||||
|
||||
def test_valid_video_codecs_constant():
|
||||
|
||||
@@ -14,11 +14,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for streaming video encoding and hardware-accelerated encoding."""
|
||||
"""Tests for streaming video encoding."""
|
||||
|
||||
import queue
|
||||
import threading
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@@ -27,112 +26,20 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])")
|
||||
|
||||
import av # noqa: E402
|
||||
|
||||
from lerobot.datasets.pyav_utils import get_codec
|
||||
from lerobot.datasets.video_utils import (
|
||||
VALID_VIDEO_CODECS,
|
||||
StreamingVideoEncoder,
|
||||
VideoEncoderConfig,
|
||||
_CameraEncoderThread,
|
||||
_get_codec_options,
|
||||
detect_available_hw_encoders,
|
||||
resolve_vcodec,
|
||||
)
|
||||
from lerobot.utils.constants import OBS_IMAGES
|
||||
|
||||
# ─── _get_codec_options tests ───
|
||||
|
||||
|
||||
class TestGetCodecOptions:
|
||||
def test_libsvtav1_defaults(self):
|
||||
opts = _get_codec_options("libsvtav1")
|
||||
assert opts["g"] == "2"
|
||||
assert opts["crf"] == "30"
|
||||
assert opts["preset"] == "12"
|
||||
|
||||
def test_libsvtav1_custom_preset(self):
|
||||
opts = _get_codec_options("libsvtav1", preset=8)
|
||||
assert opts["preset"] == "8"
|
||||
|
||||
def test_h264_options(self):
|
||||
opts = _get_codec_options("h264", g=10, crf=23)
|
||||
assert opts["g"] == "10"
|
||||
assert opts["crf"] == "23"
|
||||
assert "preset" not in opts
|
||||
|
||||
def test_videotoolbox_options(self):
|
||||
opts = _get_codec_options("h264_videotoolbox", g=2, crf=30)
|
||||
assert opts["g"] == "2"
|
||||
# CRF 30 maps to quality = max(1, min(100, 100 - 30*2)) = 40
|
||||
assert opts["q:v"] == "40"
|
||||
assert "crf" not in opts
|
||||
|
||||
def test_nvenc_options(self):
|
||||
opts = _get_codec_options("h264_nvenc", g=2, crf=25)
|
||||
assert opts["rc"] == "constqp"
|
||||
assert opts["qp"] == "25"
|
||||
assert "crf" not in opts
|
||||
# NVENC doesn't support g
|
||||
assert "g" not in opts
|
||||
|
||||
def test_vaapi_options(self):
|
||||
opts = _get_codec_options("h264_vaapi", crf=28)
|
||||
assert opts["qp"] == "28"
|
||||
|
||||
def test_qsv_options(self):
|
||||
opts = _get_codec_options("h264_qsv", crf=25)
|
||||
assert opts["global_quality"] == "25"
|
||||
|
||||
def test_no_g_no_crf(self):
|
||||
opts = _get_codec_options("h264", g=None, crf=None)
|
||||
assert "g" not in opts
|
||||
assert "crf" not in opts
|
||||
|
||||
|
||||
# ─── HW encoder detection tests ───
|
||||
|
||||
|
||||
class TestHWEncoderDetection:
|
||||
def test_detect_available_hw_encoders_returns_list(self):
|
||||
result = detect_available_hw_encoders()
|
||||
assert isinstance(result, list)
|
||||
|
||||
def test_detect_available_hw_encoders_only_valid(self):
|
||||
from lerobot.datasets.video_utils import HW_ENCODERS
|
||||
|
||||
result = detect_available_hw_encoders()
|
||||
for encoder in result:
|
||||
assert encoder in HW_ENCODERS
|
||||
|
||||
def test_resolve_vcodec_passthrough(self):
|
||||
assert resolve_vcodec("libsvtav1") == "libsvtav1"
|
||||
assert resolve_vcodec("h264") == "h264"
|
||||
|
||||
def test_resolve_vcodec_auto_fallback(self):
|
||||
"""When no HW encoders are available, auto should fall back to libsvtav1."""
|
||||
with patch("lerobot.datasets.video_utils.detect_available_hw_encoders", return_value=[]):
|
||||
assert resolve_vcodec("auto") == "libsvtav1"
|
||||
|
||||
def test_resolve_vcodec_auto_picks_hw(self):
|
||||
"""When a HW encoder is available, auto should pick it."""
|
||||
with patch(
|
||||
"lerobot.datasets.video_utils.detect_available_hw_encoders",
|
||||
return_value=["h264_videotoolbox"],
|
||||
):
|
||||
assert resolve_vcodec("auto") == "h264_videotoolbox"
|
||||
|
||||
def test_resolve_vcodec_auto_returns_valid(self):
|
||||
"""Test that resolve_vcodec('auto') returns a known valid codec."""
|
||||
result = resolve_vcodec("auto")
|
||||
assert result in VALID_VIDEO_CODECS
|
||||
|
||||
def test_hw_encoder_names_accepted_in_validation(self):
|
||||
"""Test that HW encoder names pass validation in VALID_VIDEO_CODECS."""
|
||||
assert "auto" in VALID_VIDEO_CODECS
|
||||
assert "h264_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "h264_nvenc" in VALID_VIDEO_CODECS
|
||||
|
||||
def test_resolve_vcodec_invalid_raises(self):
|
||||
"""Test that resolve_vcodec raises ValueError for invalid codecs."""
|
||||
with pytest.raises(ValueError, match="Invalid vcodec"):
|
||||
resolve_vcodec("not_a_real_codec")
|
||||
# Cross-codec validation tests only fire when the target codec is present
|
||||
# in the local FFmpeg build; on other platforms validate() is a no-op.
|
||||
_has_videotoolbox = get_codec("h264_videotoolbox") is not None
|
||||
_videotoolbox_only = pytest.mark.skipif(
|
||||
not _has_videotoolbox, reason="h264_videotoolbox not in local FFmpeg build"
|
||||
)
|
||||
|
||||
|
||||
# ─── _CameraEncoderThread tests ───
|
||||
@@ -150,14 +57,13 @@ class TestCameraEncoderThread:
|
||||
result_queue: queue.Queue = queue.Queue(maxsize=1)
|
||||
stop_event = threading.Event()
|
||||
|
||||
enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
|
||||
encoder_thread = _CameraEncoderThread(
|
||||
video_path=video_path,
|
||||
fps=fps,
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
preset=13,
|
||||
vcodec=enc_cfg.vcodec,
|
||||
pix_fmt=enc_cfg.pix_fmt,
|
||||
codec_options=enc_cfg.get_codec_options(as_strings=True),
|
||||
frame_queue=frame_queue,
|
||||
result_queue=result_queue,
|
||||
stop_event=stop_event,
|
||||
@@ -202,14 +108,13 @@ class TestCameraEncoderThread:
|
||||
result_queue: queue.Queue = queue.Queue(maxsize=1)
|
||||
stop_event = threading.Event()
|
||||
|
||||
enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
|
||||
encoder_thread = _CameraEncoderThread(
|
||||
video_path=video_path,
|
||||
fps=fps,
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
preset=13,
|
||||
vcodec=enc_cfg.vcodec,
|
||||
pix_fmt=enc_cfg.pix_fmt,
|
||||
codec_options=enc_cfg.get_codec_options(as_strings=True),
|
||||
frame_queue=frame_queue,
|
||||
result_queue=result_queue,
|
||||
stop_event=stop_event,
|
||||
@@ -237,14 +142,13 @@ class TestCameraEncoderThread:
|
||||
result_queue: queue.Queue = queue.Queue(maxsize=1)
|
||||
stop_event = threading.Event()
|
||||
|
||||
enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
|
||||
encoder_thread = _CameraEncoderThread(
|
||||
video_path=video_path,
|
||||
fps=fps,
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
preset=13,
|
||||
vcodec=enc_cfg.vcodec,
|
||||
pix_fmt=enc_cfg.pix_fmt,
|
||||
codec_options=enc_cfg.get_codec_options(as_strings=True),
|
||||
frame_queue=frame_queue,
|
||||
result_queue=result_queue,
|
||||
stop_event=stop_event,
|
||||
@@ -266,11 +170,20 @@ class TestCameraEncoderThread:
|
||||
|
||||
|
||||
class TestStreamingVideoEncoder:
|
||||
def _make_encoder_config(self, **kwargs):
|
||||
"""Helper to build a VideoEncoderConfig."""
|
||||
return VideoEncoderConfig(**kwargs)
|
||||
|
||||
def test_single_camera_episode(self, tmp_path):
|
||||
"""Test encoding a single camera episode."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
|
||||
|
||||
video_keys = [f"{OBS_IMAGES}.laptop"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
|
||||
),
|
||||
)
|
||||
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
num_frames = 20
|
||||
@@ -295,9 +208,13 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_multi_camera_episode(self, tmp_path):
|
||||
"""Test encoding multiple cameras simultaneously."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
|
||||
|
||||
video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30
|
||||
),
|
||||
)
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
num_frames = 15
|
||||
@@ -319,8 +236,13 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_sequential_episodes(self, tmp_path):
|
||||
"""Test that multiple sequential episodes work correctly."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
|
||||
video_keys = [f"{OBS_IMAGES}.cam"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30
|
||||
),
|
||||
)
|
||||
|
||||
for ep in range(3):
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
@@ -342,8 +264,13 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_cancel_episode(self, tmp_path):
|
||||
"""Test that canceling an episode cleans up properly."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
|
||||
video_keys = [f"{OBS_IMAGES}.cam"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30
|
||||
),
|
||||
)
|
||||
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
@@ -365,28 +292,33 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_feed_without_start_raises(self, tmp_path):
|
||||
"""Test that feeding frames without starting an episode raises."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
|
||||
encoder = StreamingVideoEncoder(fps=30)
|
||||
with pytest.raises(RuntimeError, match="No active episode"):
|
||||
encoder.feed_frame("cam", np.zeros((64, 96, 3), dtype=np.uint8))
|
||||
encoder.close()
|
||||
|
||||
def test_finish_without_start_raises(self, tmp_path):
|
||||
"""Test that finishing without starting raises."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
|
||||
encoder = StreamingVideoEncoder(fps=30)
|
||||
with pytest.raises(RuntimeError, match="No active episode"):
|
||||
encoder.finish_episode()
|
||||
encoder.close()
|
||||
|
||||
def test_close_is_idempotent(self, tmp_path):
|
||||
"""Test that close() can be called multiple times safely."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
|
||||
encoder = StreamingVideoEncoder(fps=30)
|
||||
encoder.close()
|
||||
encoder.close() # Should not raise
|
||||
|
||||
def test_video_duration_matches_frame_count(self, tmp_path):
|
||||
"""Test that encoded video duration matches num_frames / fps."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
|
||||
video_keys = [f"{OBS_IMAGES}.cam"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
|
||||
),
|
||||
)
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
num_frames = 90 # 3 seconds at 30fps
|
||||
@@ -417,9 +349,13 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_multi_camera_start_episode_called_once(self, tmp_path):
|
||||
"""Test that with multiple cameras, no frames are lost due to double start_episode."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
|
||||
|
||||
video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30
|
||||
),
|
||||
)
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
num_frames = 30
|
||||
@@ -446,17 +382,24 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_encoder_threads_passed_to_thread(self, tmp_path):
|
||||
"""Test that encoder_threads is stored and passed through to encoder threads."""
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, encoder_threads=2
|
||||
)
|
||||
assert encoder.encoder_threads == 2
|
||||
|
||||
video_keys = [f"{OBS_IMAGES}.cam"]
|
||||
cfg = VideoEncoderConfig(
|
||||
vcodec="libsvtav1",
|
||||
pix_fmt="yuv420p",
|
||||
g=2,
|
||||
crf=30,
|
||||
)
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=cfg,
|
||||
encoder_threads=2,
|
||||
)
|
||||
assert encoder._encoder_threads == 2
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
# Verify the thread received the encoder_threads value
|
||||
# Verify codec options include thread tuning for libsvtav1 (lp=…)
|
||||
thread = encoder._threads[f"{OBS_IMAGES}.cam"]
|
||||
assert thread.encoder_threads == 2
|
||||
assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options
|
||||
|
||||
# Feed some frames and finish to ensure it works end-to-end
|
||||
num_frames = 10
|
||||
@@ -478,16 +421,20 @@ class TestStreamingVideoEncoder:
|
||||
|
||||
def test_encoder_threads_none_by_default(self, tmp_path):
|
||||
"""Test that encoder_threads defaults to None (codec auto-detect)."""
|
||||
encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
|
||||
assert encoder.encoder_threads is None
|
||||
encoder = StreamingVideoEncoder(fps=30)
|
||||
assert encoder._encoder_threads is None
|
||||
encoder.close()
|
||||
|
||||
def test_graceful_frame_dropping(self, tmp_path):
|
||||
"""Test that full queue drops frames instead of crashing."""
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13, queue_maxsize=1
|
||||
)
|
||||
video_keys = [f"{OBS_IMAGES}.cam"]
|
||||
encoder = StreamingVideoEncoder(
|
||||
fps=30,
|
||||
camera_encoder_config=self._make_encoder_config(
|
||||
vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
|
||||
),
|
||||
queue_maxsize=1,
|
||||
)
|
||||
encoder.start_episode(video_keys, tmp_path)
|
||||
|
||||
# Feed many frames quickly - with queue_maxsize=1, some will be dropped
|
||||
|
||||
@@ -0,0 +1,572 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Unit tests for ``lerobot.datasets.video_utils`` encoding functions and ``VideoEncoderConfig`` config class."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("av", reason="av is required (install lerobot[dataset])")
|
||||
|
||||
import av # noqa: E402
|
||||
|
||||
from lerobot.configs.video import VALID_VIDEO_CODECS, VideoEncoderConfig
|
||||
from lerobot.datasets.image_writer import write_image
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.pyav_utils import get_codec
|
||||
from lerobot.datasets.utils import INFO_PATH
|
||||
from lerobot.datasets.video_utils import (
|
||||
concatenate_video_files,
|
||||
encode_video_frames,
|
||||
get_video_info,
|
||||
)
|
||||
|
||||
|
||||
# Per-codec skip markers — validation tests only fire when the codec is available
|
||||
def _require_encoder(vcodec: str) -> pytest.MarkDecorator:
|
||||
"""Skip the test if ``vcodec`` is not available in the local FFmpeg build."""
|
||||
return pytest.mark.skipif(get_codec(vcodec) is None, reason=f"{vcodec!r} not in local FFmpeg build")
|
||||
|
||||
|
||||
require_libsvtav1 = _require_encoder("libsvtav1")
|
||||
require_h264 = _require_encoder("h264")
|
||||
require_videotoolbox = _require_encoder("h264_videotoolbox")
|
||||
require_nvenc = _require_encoder("h264_nvenc")
|
||||
require_vaapi = _require_encoder("h264_vaapi")
|
||||
require_qsv = _require_encoder("h264_qsv")
|
||||
|
||||
|
||||
# ─── VideoEncoderConfig / codec options ──────────────────────────────
|
||||
|
||||
|
||||
class TestCodecOptions:
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_defaults(self):
|
||||
cfg = VideoEncoderConfig()
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["g"] == 2
|
||||
assert opts["crf"] == 30
|
||||
assert opts["preset"] == 12
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_custom_preset(self):
|
||||
cfg = VideoEncoderConfig(preset=8)
|
||||
assert cfg.get_codec_options()["preset"] == 8
|
||||
|
||||
@require_h264
|
||||
def test_h264_options(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None)
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["g"] == 10
|
||||
assert opts["crf"] == 23
|
||||
assert "preset" not in opts
|
||||
|
||||
@require_videotoolbox
|
||||
def test_videotoolbox_options(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None)
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["g"] == 2
|
||||
assert opts["q:v"] == 40
|
||||
assert "crf" not in opts
|
||||
|
||||
@_require_encoder("h264_nvenc")
|
||||
def test_nvenc_options(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None)
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["rc"] == "constqp"
|
||||
assert opts["qp"] == 25
|
||||
assert "crf" not in opts
|
||||
assert "g" not in opts
|
||||
|
||||
@_require_encoder("h264_vaapi")
|
||||
def test_vaapi_options(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None)
|
||||
assert cfg.get_codec_options()["qp"] == 28
|
||||
|
||||
@_require_encoder("h264_qsv")
|
||||
def test_qsv_options(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None)
|
||||
assert cfg.get_codec_options()["global_quality"] == 25
|
||||
|
||||
@require_h264
|
||||
def test_no_g_no_crf(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None)
|
||||
opts = cfg.get_codec_options()
|
||||
assert "g" not in opts
|
||||
assert "crf" not in opts
|
||||
|
||||
@require_libsvtav1
|
||||
def test_encoder_threads_libsvtav1(self):
|
||||
cfg = VideoEncoderConfig(fast_decode=0)
|
||||
opts = cfg.get_codec_options(encoder_threads=4)
|
||||
assert "lp=4" in opts.get("svtav1-params", "")
|
||||
|
||||
@require_h264
|
||||
def test_encoder_threads_h264(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264", preset=None)
|
||||
assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2
|
||||
|
||||
@require_libsvtav1
|
||||
def test_fast_decode_libsvtav1(self):
|
||||
cfg = VideoEncoderConfig(fast_decode=1)
|
||||
opts = cfg.get_codec_options()
|
||||
assert "fast-decode=1" in opts.get("svtav1-params", "")
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_fast_decode_clamped_to_svt_range(self):
|
||||
"""Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode)."""
|
||||
cfg = VideoEncoderConfig(fast_decode=100)
|
||||
assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "")
|
||||
cfg_neg = VideoEncoderConfig(fast_decode=-5)
|
||||
assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "")
|
||||
|
||||
@require_h264
|
||||
def test_fast_decode_h264(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None)
|
||||
assert cfg.get_codec_options()["tune"] == "fastdecode"
|
||||
|
||||
@require_libsvtav1
|
||||
def test_pix_fmt_unsupported_raises(self):
|
||||
"""Passing an unsupported pix_fmt is a hard error."""
|
||||
with pytest.raises(ValueError, match="pix_fmt"):
|
||||
VideoEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants
|
||||
|
||||
@require_libsvtav1
|
||||
@require_h264
|
||||
def test_preset_default_behaviour(self):
|
||||
"""Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None."""
|
||||
assert VideoEncoderConfig().preset == 12
|
||||
assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12
|
||||
assert VideoEncoderConfig(vcodec="h264").preset is None
|
||||
assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None
|
||||
|
||||
@require_h264
|
||||
def test_preset_string_on_h264(self):
|
||||
"""h264 accepts string presets and forwards them to FFmpeg."""
|
||||
cfg = VideoEncoderConfig(vcodec="h264", preset="slow")
|
||||
assert cfg.get_codec_options()["preset"] == "slow"
|
||||
|
||||
@require_videotoolbox
|
||||
def test_preset_on_videotoolbox_not_set(self):
|
||||
"""videotoolbox has no preset option at all."""
|
||||
cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow")
|
||||
assert "preset" not in cfg.get_codec_options()
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_preset_out_of_range_raises(self):
|
||||
"""libsvtav1 preset must sit in [-2, 13] as exposed by PyAV."""
|
||||
with pytest.raises(ValueError, match="out of range"):
|
||||
VideoEncoderConfig(vcodec="libsvtav1", preset=100)
|
||||
with pytest.raises(ValueError, match="out of range"):
|
||||
VideoEncoderConfig(vcodec="libsvtav1", preset=-3)
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_crf_out_of_range_raises(self):
|
||||
"""libsvtav1 crf must sit in [0, 63]."""
|
||||
with pytest.raises(ValueError, match="crf.*out of range"):
|
||||
VideoEncoderConfig(vcodec="libsvtav1", crf=64)
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_crf_rejects_python_float(self):
|
||||
"""libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation."""
|
||||
with pytest.raises(ValueError, match="float values are not allowed"):
|
||||
VideoEncoderConfig(vcodec="libsvtav1", crf=2.5)
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_extra_crf_rejects_fractional_string(self):
|
||||
"""INT options reject fractional values even when supplied only via ``extra_options``."""
|
||||
with pytest.raises(ValueError, match="float values are not allowed"):
|
||||
VideoEncoderConfig(
|
||||
vcodec="libsvtav1",
|
||||
crf=None,
|
||||
extra_options={"crf": "2.5"},
|
||||
)
|
||||
|
||||
@require_libsvtav1
|
||||
def test_libsvtav1_extra_crf_rejects_float(self):
|
||||
with pytest.raises(ValueError, match="float values are not allowed"):
|
||||
VideoEncoderConfig(
|
||||
vcodec="libsvtav1",
|
||||
crf=None,
|
||||
extra_options={"crf": 2.5},
|
||||
)
|
||||
|
||||
@require_h264
|
||||
def test_h264_crf_accepts_float_and_int(self):
|
||||
"""x264 exposes crf as a FLOAT option, so both int and float are accepted."""
|
||||
assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23
|
||||
assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5
|
||||
|
||||
@require_libsvtav1
|
||||
def test_validate_is_rerunnable(self):
|
||||
"""After mutating a field, validate() re-checks and surfaces new issues."""
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1")
|
||||
cfg.preset = 100 # now out of range
|
||||
with pytest.raises(ValueError, match="out of range"):
|
||||
cfg.validate()
|
||||
|
||||
|
||||
class TestExtraOptions:
|
||||
@require_libsvtav1
|
||||
def test_default_is_empty_dict(self):
|
||||
cfg = VideoEncoderConfig()
|
||||
assert cfg.extra_options == {}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_unknown_key_passes_through(self):
|
||||
"""Keys not published as AVOptions are forwarded to FFmpeg."""
|
||||
cfg = VideoEncoderConfig(extra_options={"totally_made_up_option": "value"})
|
||||
assert cfg.extra_options == {"totally_made_up_option": "value"}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_numeric_value_in_range_ok(self):
|
||||
"""libsvtav1 exposes ``qp`` as INT in [0, 63]."""
|
||||
cfg = VideoEncoderConfig(extra_options={"qp": 30})
|
||||
assert cfg.extra_options == {"qp": 30}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_numeric_out_of_range_raises(self):
|
||||
with pytest.raises(ValueError, match=r"extra_options\['qp'\].*out of range"):
|
||||
VideoEncoderConfig(extra_options={"qp": 999})
|
||||
|
||||
@require_libsvtav1
|
||||
def test_numeric_string_accepted_in_range(self):
|
||||
"""Numeric strings are accepted for numeric options (mirrors FFmpeg)."""
|
||||
cfg = VideoEncoderConfig(extra_options={"qp": "18"})
|
||||
assert cfg.extra_options == {"qp": "18"}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_numeric_string_out_of_range_raises(self):
|
||||
with pytest.raises(ValueError, match=r"extra_options\['qp'\].*out of range"):
|
||||
VideoEncoderConfig(extra_options={"qp": "999"})
|
||||
|
||||
@require_libsvtav1
|
||||
def test_non_numeric_string_on_numeric_option_raises(self):
|
||||
with pytest.raises(ValueError, match=r"extra_options\['qp'\].*not numeric"):
|
||||
VideoEncoderConfig(extra_options={"qp": "medium"})
|
||||
|
||||
@require_libsvtav1
|
||||
def test_bool_on_numeric_option_raises(self):
|
||||
"""``bool`` is explicitly rejected for numeric options."""
|
||||
with pytest.raises(ValueError, match=r"extra_options\['qp'\].*not numeric"):
|
||||
VideoEncoderConfig(extra_options={"qp": True})
|
||||
|
||||
@require_h264
|
||||
def test_string_option_passes_through_unchecked(self):
|
||||
"""String-typed AVOptions are NOT enum-checked (too many accept freeform)."""
|
||||
cfg = VideoEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"})
|
||||
assert cfg.extra_options == {"tune": "some-future-tune"}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_merged_into_codec_options_and_stringified(self):
|
||||
"""Typed merge by default; ``as_strings=True`` matches FFmpeg option dict."""
|
||||
cfg = VideoEncoderConfig(extra_options={"qp": 20})
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["qp"] == 20
|
||||
assert isinstance(opts["qp"], int)
|
||||
assert cfg.get_codec_options(as_strings=True)["qp"] == "20"
|
||||
|
||||
@require_libsvtav1
|
||||
def test_structured_fields_win_on_collision(self):
|
||||
"""A colliding extra_options key is discarded; the structured field wins."""
|
||||
cfg = VideoEncoderConfig(crf=30, extra_options={"crf": 18})
|
||||
assert cfg.get_codec_options()["crf"] == 30
|
||||
|
||||
|
||||
class TestEncoderDetection:
|
||||
@require_h264
|
||||
def test_explicit_codec_kept_when_available(self):
|
||||
cfg = VideoEncoderConfig(vcodec="h264")
|
||||
assert cfg.vcodec == "h264"
|
||||
|
||||
@require_videotoolbox
|
||||
def test_auto_picks_videotoolbox_when_available(self):
|
||||
"""``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present."""
|
||||
cfg = VideoEncoderConfig(vcodec="auto")
|
||||
assert cfg.vcodec == "h264_videotoolbox"
|
||||
|
||||
def test_invalid_codec_raises(self):
|
||||
with pytest.raises(ValueError, match="Invalid vcodec"):
|
||||
VideoEncoderConfig(vcodec="not_a_real_codec")
|
||||
|
||||
def test_hw_encoder_names_listed_as_valid(self):
|
||||
assert "auto" in VALID_VIDEO_CODECS
|
||||
assert "h264_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "h264_nvenc" in VALID_VIDEO_CODECS
|
||||
|
||||
|
||||
TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos"
|
||||
|
||||
# Default video feature set used by persistence tests.
|
||||
VIDEO_FEATURES = {
|
||||
"observation.images.cam": {
|
||||
"dtype": "video",
|
||||
"shape": (64, 96, 3),
|
||||
"names": ["height", "width", "channels"],
|
||||
},
|
||||
"action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]},
|
||||
}
|
||||
VIDEO_KEY = "observation.images.cam"
|
||||
|
||||
|
||||
def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
|
||||
imgs_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i in range(num_frames):
|
||||
arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
|
||||
write_image(arr, imgs_dir / f"frame-{i:06d}.png")
|
||||
|
||||
|
||||
def _encode_video(
|
||||
path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None
|
||||
) -> Path:
|
||||
imgs_dir = path.parent / f"imgs_{path.stem}"
|
||||
_write_frames(imgs_dir, num_frames=num_frames)
|
||||
encode_video_frames(imgs_dir, path, fps=fps, camera_encoder_config=cfg, overwrite=True)
|
||||
return path
|
||||
|
||||
|
||||
def _read_feature_info(dataset: LeRobotDataset) -> dict:
|
||||
info = json.loads((dataset.root / INFO_PATH).read_text())
|
||||
return info["features"][VIDEO_KEY]["info"]
|
||||
|
||||
|
||||
def _add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
|
||||
shape = dataset.meta.features[VIDEO_KEY]["shape"]
|
||||
for _ in range(num_frames):
|
||||
dataset.add_frame(
|
||||
{
|
||||
VIDEO_KEY: np.random.randint(0, 256, shape, dtype=np.uint8),
|
||||
"action": np.zeros(2, dtype=np.float32),
|
||||
"task": "test",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class TestGetVideoInfo:
|
||||
def test_returns_all_stream_fields(self):
|
||||
info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4")
|
||||
|
||||
assert info["video.height"] == 64
|
||||
assert info["video.width"] == 96
|
||||
assert info["video.pix_fmt"] == "yuv420p"
|
||||
assert info["video.fps"] == 30
|
||||
assert info["video.channels"] == 3
|
||||
assert info["video.is_depth_map"] is False
|
||||
assert info["has_audio"] is False
|
||||
assert "video.g" not in info
|
||||
assert "video.crf" not in info
|
||||
assert "video.preset" not in info
|
||||
|
||||
@require_libsvtav1
|
||||
def test_merges_encoder_config_as_video_prefixed_entries(self):
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
|
||||
|
||||
info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder_config=cfg)
|
||||
|
||||
assert info["video.g"] == 2
|
||||
assert info["video.crf"] == 30
|
||||
assert info["video.preset"] == 12
|
||||
assert info["video.fast_decode"] == 0
|
||||
assert info["video.video_backend"] == "pyav"
|
||||
assert info["video.extra_options"] == {}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_stream_derived_keys_take_precedence_over_config(self):
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p")
|
||||
|
||||
info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder_config=cfg)
|
||||
|
||||
assert info["video.codec"] # populated from stream, not from config's vcodec
|
||||
assert info["video.pix_fmt"] == "yuv420p"
|
||||
|
||||
|
||||
class TestEncodeVideoFrames:
|
||||
@require_libsvtav1
|
||||
def test_produces_readable_mp4(self, tmp_path):
|
||||
video_path = _encode_video(tmp_path / "out.mp4")
|
||||
|
||||
assert video_path.exists()
|
||||
info = get_video_info(video_path)
|
||||
assert info["video.height"] == 64
|
||||
assert info["video.width"] == 96
|
||||
|
||||
@require_libsvtav1
|
||||
def test_frame_count_and_duration_match_input(self, tmp_path):
|
||||
num_frames = 10
|
||||
fps = 30
|
||||
video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps)
|
||||
|
||||
with av.open(str(video_path)) as container:
|
||||
stream = container.streams.video[0]
|
||||
actual_frames = sum(1 for _ in container.decode(stream))
|
||||
duration = (
|
||||
float(stream.duration * stream.time_base)
|
||||
if stream.duration is not None
|
||||
else float(container.duration / av.time_base)
|
||||
)
|
||||
|
||||
assert actual_frames == num_frames
|
||||
assert abs(duration - num_frames / fps) < 0.1
|
||||
|
||||
def test_overwrite_false_skips_existing_file(self, tmp_path):
|
||||
imgs_dir = tmp_path / "imgs"
|
||||
_write_frames(imgs_dir)
|
||||
video_path = tmp_path / "out.mp4"
|
||||
sentinel = b"pre-existing content"
|
||||
video_path.write_bytes(sentinel)
|
||||
|
||||
encode_video_frames(imgs_dir, video_path, fps=30, overwrite=False)
|
||||
|
||||
assert video_path.read_bytes() == sentinel
|
||||
|
||||
@require_libsvtav1
|
||||
def test_overwrite_true_replaces_existing_file(self, tmp_path):
|
||||
imgs_dir = tmp_path / "imgs"
|
||||
_write_frames(imgs_dir)
|
||||
video_path = tmp_path / "out.mp4"
|
||||
video_path.write_bytes(b"stale content")
|
||||
|
||||
encode_video_frames(imgs_dir, video_path, fps=30, overwrite=True)
|
||||
|
||||
info = get_video_info(video_path)
|
||||
assert info["video.height"] == 64
|
||||
|
||||
@require_libsvtav1
|
||||
def test_custom_encoder_config_fields_stored_in_info(self, tmp_path):
|
||||
"""All stream-derived and encoder config fields are present after encoding."""
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10)
|
||||
video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg)
|
||||
|
||||
info = get_video_info(video_path, camera_encoder_config=cfg)
|
||||
|
||||
# Stream-derived
|
||||
assert info["video.height"] == 64
|
||||
assert info["video.width"] == 96
|
||||
assert info["video.channels"] == 3
|
||||
assert info["video.codec"] == "av1"
|
||||
assert info["video.pix_fmt"] == "yuv420p"
|
||||
assert info["video.fps"] == 30
|
||||
assert info["video.is_depth_map"] is False
|
||||
assert info["has_audio"] is False
|
||||
# Encoder config
|
||||
assert info["video.g"] == 4
|
||||
assert info["video.crf"] == 25
|
||||
assert info["video.preset"] == 10
|
||||
assert info["video.fast_decode"] == 0
|
||||
assert info["video.video_backend"] == "pyav"
|
||||
assert info["video.extra_options"] == {}
|
||||
|
||||
|
||||
class TestConcatenateVideoFiles:
|
||||
def test_two_clips_frame_count(self, tmp_path):
|
||||
"""Output frame count equals the sum of the two input frame counts."""
|
||||
out = tmp_path / "out.mp4"
|
||||
concatenate_video_files(
|
||||
[TEST_ARTIFACTS_DIR / "clip_6frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out
|
||||
)
|
||||
|
||||
with av.open(str(out)) as container:
|
||||
total = sum(1 for _ in container.decode(video=0))
|
||||
assert total == 10
|
||||
|
||||
def test_three_clips_frame_count(self, tmp_path):
|
||||
out = tmp_path / "out.mp4"
|
||||
clip = TEST_ARTIFACTS_DIR / "clip_5frames.mp4"
|
||||
concatenate_video_files([clip, clip, clip], out)
|
||||
|
||||
with av.open(str(out)) as container:
|
||||
total = sum(1 for _ in container.decode(video=0))
|
||||
assert total == 15
|
||||
|
||||
@require_libsvtav1
|
||||
def test_geometry_preserved(self, tmp_path):
|
||||
"""Output resolution, fps, codec and pixel format must match the inputs."""
|
||||
out = tmp_path / "out.mp4"
|
||||
concatenate_video_files(
|
||||
[TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out
|
||||
)
|
||||
|
||||
info = get_video_info(out)
|
||||
assert info["video.height"] == 64
|
||||
assert info["video.width"] == 96
|
||||
assert info["video.fps"] == 30
|
||||
assert info["video.codec"] == "av1"
|
||||
assert info["video.pix_fmt"] == "yuv420p"
|
||||
|
||||
def test_compatibility_check_raises_on_different_codec(self, tmp_path):
|
||||
with pytest.raises(ValueError):
|
||||
concatenate_video_files(
|
||||
[TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_h264.mp4"],
|
||||
tmp_path / "out.mp4",
|
||||
compatibility_check=True,
|
||||
)
|
||||
|
||||
def test_compatibility_check_raises_on_different_resolution(self, tmp_path):
|
||||
with pytest.raises(ValueError):
|
||||
concatenate_video_files(
|
||||
[TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_32x48.mp4"],
|
||||
tmp_path / "out.mp4",
|
||||
compatibility_check=True,
|
||||
)
|
||||
|
||||
|
||||
class TestEncoderConfigPersistence:
|
||||
"""Encoder config must be stored as ``video.<field>`` entries in
|
||||
``info["features"][key]["info"]`` when the first episode is saved.
|
||||
"""
|
||||
|
||||
@require_libsvtav1
|
||||
def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory):
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
|
||||
dataset = empty_lerobot_dataset_factory(
|
||||
root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder_config=cfg
|
||||
)
|
||||
|
||||
_add_frames(dataset, num_frames=4)
|
||||
dataset.save_episode()
|
||||
dataset.finalize()
|
||||
|
||||
info = _read_feature_info(dataset)
|
||||
|
||||
assert info["video.height"] == 64
|
||||
assert info["video.width"] == 96
|
||||
assert info["video.fps"] == 30
|
||||
assert info["video.g"] == 2
|
||||
assert info["video.crf"] == 30
|
||||
assert info["video.preset"] == 12
|
||||
assert info["video.fast_decode"] == 0
|
||||
assert info["video.video_backend"] == "pyav"
|
||||
assert info["video.extra_options"] == {}
|
||||
|
||||
@require_libsvtav1
|
||||
def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory):
|
||||
cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
|
||||
dataset = empty_lerobot_dataset_factory(
|
||||
root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder_config=cfg
|
||||
)
|
||||
|
||||
_add_frames(dataset, num_frames=4)
|
||||
dataset.save_episode()
|
||||
first_info = dict(_read_feature_info(dataset))
|
||||
|
||||
_add_frames(dataset, num_frames=4)
|
||||
dataset.save_episode()
|
||||
dataset.finalize()
|
||||
|
||||
assert _read_feature_info(dataset) == first_info
|
||||
Reference in New Issue
Block a user