diff --git a/docs/source/act.mdx b/docs/source/act.mdx index 453bcbba8..520396ac3 100644 --- a/docs/source/act.mdx +++ b/docs/source/act.mdx @@ -90,6 +90,6 @@ lerobot-record \ --dataset.single_task="Your task description" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --policy.path=${HF_USER}/act_policy ``` diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx index a87bd325b..802e49ae3 100644 --- a/docs/source/earthrover_mini_plus.mdx +++ b/docs/source/earthrover_mini_plus.mdx @@ -194,7 +194,7 @@ lerobot-record \ --dataset.single_task="Navigate around obstacles" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx index 2f53a4d0b..1f74ad73f 100644 --- a/docs/source/groot.mdx +++ b/docs/source/groot.mdx @@ -123,7 +123,7 @@ lerobot-record \ --dataset.single_task="Grab and handover the red cube to the other arm" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --policy.path=/groot-bimanual \ # your trained model --dataset.episode_time_s=30 \ --dataset.reset_time_s=10 diff --git a/docs/source/hope_jr.mdx b/docs/source/hope_jr.mdx index 8826d9758..6c01bc912 100644 --- a/docs/source/hope_jr.mdx +++ b/docs/source/hope_jr.mdx @@ -232,7 +232,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --display_data=true ``` @@ -278,6 +278,6 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model ``` diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index ff0a6229e..6bb367c36 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -193,7 +193,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx index 8ab4a5d40..6be937e5e 100644 --- a/docs/source/lerobot-dataset-v3.mdx +++ b/docs/source/lerobot-dataset-v3.mdx @@ -43,7 +43,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx index 1b868711a..b70095960 100644 --- a/docs/source/reachy2.mdx +++ b/docs/source/reachy2.mdx @@ -161,7 +161,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --display_data=true ``` @@ -203,7 +203,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/smolvla.mdx b/docs/source/smolvla.mdx index bf8a0d2f0..8a24806b9 100644 --- a/docs/source/smolvla.mdx +++ b/docs/source/smolvla.mdx @@ -108,7 +108,7 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder_config.vcodec=auto \ # <- Teleop optional if you want to teleoperate in between episodes \ # --teleop.type=so100_leader \ # --teleop.port=/dev/ttyACM0 \ diff --git a/docs/source/streaming_video_encoding.mdx b/docs/source/streaming_video_encoding.mdx index 40004200e..f4eb34f3f 100644 --- a/docs/source/streaming_video_encoding.mdx +++ b/docs/source/streaming_video_encoding.mdx @@ -14,12 +14,22 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti ## 2. Tuning Parameters -| Parameter | CLI Flag | Type | Default | Description | -| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- | -| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture | -| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | -| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide | -| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM | +All encoding parameters are grouped under `camera_encoder_config` (a `VideoEncoderConfig` dataclass), accessible from the CLI via `--dataset.camera_encoder_config.`. + +| Parameter | CLI Flag | Type | Default | Description | +| ----------------------- | --------------------------------------------- | ------------- | ------------- | ------------------------------------------------------------------- | +| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture | +| `vcodec` | `--dataset.camera_encoder_config.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | +| `pix_fmt` | `--dataset.camera_encoder_config.pix_fmt` | `str` | `"yuv420p"` | Pixel format | +| `g` | `--dataset.camera_encoder_config.g` | `int \| None` | `2` | GOP size (keyframe interval) | +| `crf` | `--dataset.camera_encoder_config.crf` | `int \| None` | `30` | Quality level (mapped to codec-specific parameter) | +| `preset` | `--dataset.camera_encoder_config.preset` | `int \| None` | `12` | Speed preset (libsvtav1 only, 0 = slowest … 13 = fastest) | +| `fast_decode` | `--dataset.camera_encoder_config.fast_decode` | `int` | `0` | Fast-decode tuning level | +| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance (global). `None` lets the codec decide | +| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM | + +> [!TIP] +> Not all parameters apply to every codec. `VideoEncoderConfig` will warn at startup if you set a parameter that your chosen codec ignores (e.g. `preset` with `h264_nvenc`). ## 3. Performance Considerations @@ -40,7 +50,7 @@ Streaming encoding means the CPU is encoding video **during** the capture loop, ### `encoder_threads` Tuning -This parameter controls how many threads each encoder instance uses internally: +This parameter (`--dataset.encoder_threads`) controls how many threads each encoder instance uses internally: - **Higher values** (e.g., 4-5): Faster encoding, but uses more CPU cores per camera. Good for high-end systems with many cores. - **Lower values** (e.g., 1-2): Less CPU per camera, freeing cores for capture and visualization. Good for low-res images and capable CPUs. @@ -82,15 +92,15 @@ Use HW encoding when: ### Available HW Encoders -| Encoder | Platform | Hardware | CLI Value | -| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ | -| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` | -| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` | -| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` | -| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` | -| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` | -| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` | -| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` | +| Encoder | Platform | Hardware | CLI Value | +| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ---------------------------------------------------------- | +| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder_config.vcodec=h264_videotoolbox` | +| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder_config.vcodec=hevc_videotoolbox` | +| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder_config.vcodec=h264_nvenc` | +| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder_config.vcodec=hevc_nvenc` | +| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder_config.vcodec=h264_vaapi` | +| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder_config.vcodec=h264_qsv` | +| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder_config.vcodec=auto` | > [!NOTE] > In order to use the HW accelerated encoders you might need to upgrade your GPU drivers. @@ -100,15 +110,15 @@ Use HW encoding when: ## 5. Troubleshooting -| Symptom | Likely Cause | Fix | -| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) | -| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). | -| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | -| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | -| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | -| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` | -| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | +| Symptom | Likely Cause | Fix | +| ------------------------------------------------------------------ | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder_config.vcodec=auto`) | +| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder_config.vcodec=auto`). | +| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | +| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | +| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | +| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder_config.vcodec=auto` | +| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | ## 6. Recommended Configurations @@ -146,10 +156,10 @@ On very constrained systems, streaming encoding may compete too heavily with the # 2camsx 640x480x3 @30fps: Requires some tuning. # Use H.264, disable streaming, consider batching encoding -lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ... +lerobot-record --dataset.camera_encoder_config.vcodec=h264 --dataset.streaming_encoding=false ... ``` ## 7. Closing note Performance ultimately depends on your exact setup — frames-per-second, resolution, CPU cores and load, available memory, episode length, and the encoder you choose. Always test with your target workload, be mindful about your CPU & system capabilities and tune `encoder_threads`, `encoder_queue_maxsize`, and -`vcodec` reasonably. That said, a common practical configuration (for many applications) is three cameras at 640×480x3 @30fps; this usually runs fine with the default streaming video encoding settings in modern systems. Always verify your recorded dataset is healthy by comparing the video duration to the CLI episode duration and confirming the row count equals FPS × CLI duration. +`camera_encoder_config.vcodec` reasonably. That said, a common practical configuration (for many applications) is three cameras at 640×480x3 @30fps; this usually runs fine with the default streaming video encoding settings in modern systems. Always verify your recorded dataset is healthy by comparing the video duration to the CLI episode duration and confirming the row count equals FPS × CLI duration. diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index f7fc9be20..77e05ef7c 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -117,10 +117,10 @@ lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ --operation.type convert_image_to_video \ --operation.output_dir outputs/pusht_video \ - --operation.vcodec libsvtav1 \ - --operation.pix_fmt yuv420p \ - --operation.g 2 \ - --operation.crf 30 + --operation.camera_encoder_config.vcodec libsvtav1 \ + --operation.camera_encoder_config.pix_fmt yuv420p \ + --operation.camera_encoder_config.g 2 \ + --operation.camera_encoder_config.crf 30 # Convert only specific episodes lerobot-edit-dataset \ @@ -147,11 +147,14 @@ lerobot-edit-dataset \ **Parameters:** - `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`) -- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`) -- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`) -- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2) -- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30) -- `fast_decode`: Fast decode tuning option (default: 0) +- `camera_encoder_config`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder_config.`: + - `vcodec`: Video codec — `h264`, `hevc`, `libsvtav1`, `auto`, or hardware codecs (default: `libsvtav1`) + - `pix_fmt`: Pixel format — `yuv420p`, `yuv444p` (default: `yuv420p`) + - `g`: GOP size — lower values give better quality but larger files (default: 2) + - `crf`: Quality level — lower is better, 0 is lossless (default: 30) + - `preset`: Speed preset, libsvtav1 only (default: 12) + - `fast_decode`: Fast-decode tuning (default: 0) + - `encoder_threads`: Threads per encoder instance — global setting, separate from `camera_encoder_config` (default: None) - `episode_indices`: List of specific episodes to convert (default: all episodes) - `num_workers`: Number of parallel workers for processing (default: 4)