From bd9619dfc3fadd3647408537d89e82f83f770851 Mon Sep 17 00:00:00 2001 From: Caroline Pascal Date: Thu, 14 May 2026 23:46:42 +0200 Subject: [PATCH] feat(encoding parameters): adding support for user provided video encoding parameters (#3455) * chore(video backend): renaming codec into video_backend in get_safe_default_video_backend() * feat(pyav utils): adding support for PyAV encoding parameters validation * feat(VideoEncoderConfig): creating a VideoEncoderConfig to encapsulate encoding parameters * feat(VideoEncoderConfig): propagating the VideoEncoderConfig in the codebase * chore(docs): updating the docs * feat(metadata): adding encoding parameters in dataset metadata * fix(concatenation compatibility): adding compatibility check when concatenating video files * feat(VideoEncoderConfig init): making VideoEncoderConfig more robust and adaptable to multiple backends * feat(pyav checks): making pyav parameters checks more robust * chore(duplicate): removing duplicate get_codec_options definition * test(existing): adapting existing tests * test(new): adding new tests for encoding related features * chore(format): fixing formatting issues * chore(PyAV): cleaning up PyAV utils and encoding parameters checks to stick to the minimum required tooling. * chore(format): formatting code * chore(docstrings): updating docstrings * fix(camera_encoder_config): Removing camera_encoder_config from LeRobotDataset, as it's only required in LeRobotDatasetWriter. 
* feat(default values): applying a consistent naming convention for default RGB cameras video encoder parameters * fix(rollout): propagating VideoEncoderConfig to the latest recording modes * chore(format): formatting code, fixing error messages and variable names * fix(arguments order): reverting changes in arguments order in StreamingVideoEncoder * chore(relative imports): switching to relative local imports within lerobot.datasets * test(artifacts): cleaning up artifacts for the video encoding tests * chore(docs): updating docs * chore(format): formatting code * fix(imports): refactoring the file architecture to avoid circular imports. VideoEncoderConfig is now defined in lerobot.configs and lazily imports av at runtime. * fix(typos): fixing typos and small mistakes * test(factories): updating factories * feat(aggregate): updating dataset aggregation procedure. Encoding tuning parameters (crf, g,...) are ignored for validation and changed to None in the aggregated dataset if incompatible. 
* docs(typos): fixing typos * fix(deletion): reverting unwanted deletion * fix(typos): fixing multiple typos * feat(codec options): passing codec options to lerobot_edit_dataset episode deletion tool * typo(typo): typo * fix(typos): fixing remaining typos * chore(rename): renaming camera_encoder_config to camera_encoder * docs(clean): cleaning and formatting docs * docs(dataset): adding details about datasets * chore(format): formatting code * docs(warning): adding warning regarding encoding parameters modification * fix(re-encoding): removing inconsistent re-encoding option in lerobot_edit_dataset * typos(typos): typos * chore(format): resolving prettier issues * fix(h264_nvenc): fixing crf handling for h264_nvenc * docs(clean): removing too technical parts of the docs * fix(imports): fixing imports at the __init__ level * fix(imports): fixing not very pretty imports in video config file --- docs/source/_toctree.yml | 2 + docs/source/act.mdx | 2 +- docs/source/earthrover_mini_plus.mdx | 2 +- docs/source/groot.mdx | 2 +- docs/source/hope_jr.mdx | 4 +- docs/source/il_robots.mdx | 2 +- docs/source/lerobot-dataset-v3.mdx | 2 +- docs/source/reachy2.mdx | 4 +- docs/source/smolvla.mdx | 2 +- docs/source/streaming_video_encoding.mdx | 44 +- docs/source/using_dataset_tools.mdx | 14 +- docs/source/video_encoding_parameters.mdx | 117 ++++ src/lerobot/configs/__init__.py | 12 + src/lerobot/configs/dataset.py | 11 +- src/lerobot/configs/default.py | 4 +- src/lerobot/configs/eval.py | 2 +- src/lerobot/configs/rewards.py | 3 +- src/lerobot/configs/train.py | 2 +- src/lerobot/configs/video.py | 235 +++++++ src/lerobot/datasets/__init__.py | 3 + src/lerobot/datasets/aggregate.py | 56 +- src/lerobot/datasets/dataset_metadata.py | 20 +- src/lerobot/datasets/dataset_tools.py | 94 ++- src/lerobot/datasets/dataset_writer.py | 34 +- src/lerobot/datasets/feature_utils.py | 36 ++ src/lerobot/datasets/lerobot_dataset.py | 79 ++- src/lerobot/datasets/pyav_utils.py | 174 +++++ 
src/lerobot/datasets/video_utils.py | 228 +++---- src/lerobot/policies/eo1/modeling_eo1.py | 5 +- src/lerobot/policies/eo1/processor_eo1.py | 3 +- .../rewards/classifier/modeling_classifier.py | 5 +- .../classifier/processor_classifier.py | 3 +- src/lerobot/rewards/factory.py | 7 +- .../rewards/sarm/compute_rabc_weights.py | 7 +- src/lerobot/rewards/sarm/modeling_sarm.py | 9 +- src/lerobot/rewards/sarm/processor_sarm.py | 9 +- src/lerobot/rollout/context.py | 4 +- src/lerobot/scripts/lerobot_edit_dataset.py | 16 +- src/lerobot/scripts/lerobot_record.py | 31 +- src/lerobot/scripts/lerobot_rollout.py | 12 + src/lerobot/transport/utils.py | 3 +- src/lerobot/utils/import_utils.py | 5 +- tests/artifacts/encoded_videos/clip_32x48.mp4 | 3 + .../artifacts/encoded_videos/clip_4frames.mp4 | 3 + .../artifacts/encoded_videos/clip_5frames.mp4 | 3 + .../artifacts/encoded_videos/clip_6frames.mp4 | 3 + tests/artifacts/encoded_videos/clip_h264.mp4 | 3 + tests/datasets/test_aggregate.py | 76 ++- tests/datasets/test_dataset_reader.py | 6 +- tests/datasets/test_dataset_tools.py | 13 +- tests/datasets/test_dataset_writer.py | 24 +- tests/datasets/test_datasets.py | 14 +- .../datasets/test_streaming_video_encoder.py | 221 +++---- tests/datasets/test_video_encoding.py | 595 ++++++++++++++++++ tests/fixtures/constants.py | 14 +- tests/fixtures/dataset_factories.py | 5 +- 56 files changed, 1765 insertions(+), 527 deletions(-) create mode 100644 docs/source/video_encoding_parameters.mdx create mode 100644 src/lerobot/configs/video.py create mode 100644 src/lerobot/datasets/pyav_utils.py create mode 100644 tests/artifacts/encoded_videos/clip_32x48.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_4frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_5frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_6frames.mp4 create mode 100644 tests/artifacts/encoded_videos/clip_h264.mp4 create mode 100644 tests/datasets/test_video_encoding.py diff --git 
a/docs/source/_toctree.yml b/docs/source/_toctree.yml index de4eeaa28..f1dfe9aae 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -41,6 +41,8 @@ title: Using the Dataset Tools - local: dataset_subtask title: Using Subtasks in the Dataset + - local: video_encoding_parameters + title: Video encoding parameters - local: streaming_video_encoding title: Streaming Video Encoding title: "Datasets" diff --git a/docs/source/act.mdx b/docs/source/act.mdx index 453bcbba8..8e91edcf9 100644 --- a/docs/source/act.mdx +++ b/docs/source/act.mdx @@ -90,6 +90,6 @@ lerobot-record \ --dataset.single_task="Your task description" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=${HF_USER}/act_policy ``` diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx index a87bd325b..508c0e3a9 100644 --- a/docs/source/earthrover_mini_plus.mdx +++ b/docs/source/earthrover_mini_plus.mdx @@ -194,7 +194,7 @@ lerobot-record \ --dataset.single_task="Navigate around obstacles" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx index 2f53a4d0b..d69d10a57 100644 --- a/docs/source/groot.mdx +++ b/docs/source/groot.mdx @@ -123,7 +123,7 @@ lerobot-record \ --dataset.single_task="Grab and handover the red cube to the other arm" \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=/groot-bimanual \ # your trained model --dataset.episode_time_s=30 \ --dataset.reset_time_s=10 diff --git a/docs/source/hope_jr.mdx b/docs/source/hope_jr.mdx index 8826d9758..1f3b08fd7 100644 --- a/docs/source/hope_jr.mdx +++ b/docs/source/hope_jr.mdx @@ -232,7 +232,7 @@ lerobot-record \ --dataset.private=true 
\ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` @@ -278,6 +278,6 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model ``` diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index ff0a6229e..07789225a 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -193,7 +193,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx index 8ab4a5d40..6f3e6d948 100644 --- a/docs/source/lerobot-dataset-v3.mdx +++ b/docs/source/lerobot-dataset-v3.mdx @@ -43,7 +43,7 @@ lerobot-record \ --dataset.num_episodes=5 \ --dataset.single_task="Grab the black cube" \ --dataset.streaming_encoding=true \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --dataset.encoder_threads=2 ``` diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx index 1b868711a..4b08569db 100644 --- a/docs/source/reachy2.mdx +++ b/docs/source/reachy2.mdx @@ -161,7 +161,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` @@ -203,7 +203,7 @@ lerobot-record \ --dataset.private=true \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ --display_data=true ``` diff --git a/docs/source/smolvla.mdx b/docs/source/smolvla.mdx index bf8a0d2f0..6c63c5d11 
100644 --- a/docs/source/smolvla.mdx +++ b/docs/source/smolvla.mdx @@ -108,7 +108,7 @@ lerobot-record \ --dataset.num_episodes=10 \ --dataset.streaming_encoding=true \ --dataset.encoder_threads=2 \ - # --dataset.vcodec=auto \ + # --dataset.camera_encoder.vcodec=auto \ # <- Teleop optional if you want to teleoperate in between episodes \ # --teleop.type=so100_leader \ # --teleop.port=/dev/ttyACM0 \ diff --git a/docs/source/streaming_video_encoding.mdx b/docs/source/streaming_video_encoding.mdx index 40004200e..96e049eb3 100644 --- a/docs/source/streaming_video_encoding.mdx +++ b/docs/source/streaming_video_encoding.mdx @@ -17,9 +17,9 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti | Parameter | CLI Flag | Type | Default | Description | | ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- | | `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture | -| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | +| `vcodec` | `--dataset.camera_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder | | `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide | -| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM | +| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `30` | Max buffered frames per camera (~1s at 30fps). Consumes RAM | ## 3. Performance Considerations @@ -48,7 +48,7 @@ This parameter controls how many threads each encoder instance uses internally: ### Backpressure and Frame Dropping -Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). 
When the encoder can't keep up: +Each camera has a bounded queue (`encoder_queue_maxsize`, default 30 frames). When the encoder can't keep up: 1. The queue fills up (consuming RAM) 2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted @@ -82,15 +82,15 @@ Use HW encoding when: ### Available HW Encoders -| Encoder | Platform | Hardware | CLI Value | -| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ | -| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` | -| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` | -| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` | -| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` | -| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` | -| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` | -| `auto` | Any | Probes the system for available HW encoders. 
Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` | +| Encoder | Platform | Hardware | CLI Value | +| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | --------------------------------------------------- | +| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=h264_videotoolbox` | +| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=hevc_videotoolbox` | +| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=h264_nvenc` | +| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=hevc_nvenc` | +| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder.vcodec=h264_vaapi` | +| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder.vcodec=h264_qsv` | +| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder.vcodec=auto` | > [!NOTE] > In order to use the HW accelerated encoders you might need to upgrade your GPU drivers. @@ -100,15 +100,15 @@ Use HW encoding when: ## 5. Troubleshooting -| Symptom | Likely Cause | Fix | -| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) | -| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). | -| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | -| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | -| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | -| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` | -| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | +| Symptom | Likely Cause | Fix | +| ------------------------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. 
If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder.vcodec=auto`) | +| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder.vcodec=auto`). | +| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding | +| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows | +| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` | +| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder.vcodec=auto` | +| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. | ## 6. Recommended Configurations @@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the # 2camsx 640x480x3 @30fps: Requires some tuning. # Use H.264, disable streaming, consider batching encoding -lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ... +lerobot-record --dataset.camera_encoder.vcodec=h264 --dataset.streaming_encoding=false ... ``` ## 7. 
Closing note diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx index f7fc9be20..49247a6c1 100644 --- a/docs/source/using_dataset_tools.mdx +++ b/docs/source/using_dataset_tools.mdx @@ -117,10 +117,10 @@ lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ --operation.type convert_image_to_video \ --operation.output_dir outputs/pusht_video \ - --operation.vcodec libsvtav1 \ - --operation.pix_fmt yuv420p \ - --operation.g 2 \ - --operation.crf 30 + --operation.camera_encoder.vcodec libsvtav1 \ + --operation.camera_encoder.pix_fmt yuv420p \ + --operation.camera_encoder.g 2 \ + --operation.camera_encoder.crf 30 # Convert only specific episodes lerobot-edit-dataset \ @@ -147,11 +147,7 @@ lerobot-edit-dataset \ **Parameters:** - `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`) -- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`) -- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`) -- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2) -- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30) -- `fast_decode`: Fast decode tuning option (default: 0) +- `camera_encoder`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder.. See [Video Encoding Parameters](./video_encoding_parameters) for more details. 
- `episode_indices`: List of specific episodes to convert (default: all episodes) - `num_workers`: Number of parallel workers for processing (default: 4) diff --git a/docs/source/video_encoding_parameters.mdx b/docs/source/video_encoding_parameters.mdx new file mode 100644 index 000000000..0b5b99b2b --- /dev/null +++ b/docs/source/video_encoding_parameters.mdx @@ -0,0 +1,117 @@ +# Video encoding parameters + +When video storage is enabled, LeRobot stores each camera stream as an **MP4** file instead of saving one image file per timestep. Video encoding compresses across time, which usually cuts dataset size and I/O compared to a pile of PNG, while keeping MP4 — a format every player and loader understands. + +Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `camera_encoder`, a nested `VideoEncoderConfig` (`lerobot.configs.video.VideoEncoderConfig`) passed through PyAV. + +You can set these parameters from the CLI with `--dataset.camera_encoder.` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every camera video stream in that run. + + + Video storage must be on for `camera_encoder` to have any effect — + `use_videos=True` in Python APIs, or `--dataset.video=true` on the CLI (the + recording default). With video off, inputs stay as images and `camera_encoder` + is ignored. + + +For details on **when** frames are written vs. encoded (streaming vs. post-episode), queues, and other top-level `--dataset.*` switches, see [Streaming Video Encoding](./streaming_video_encoding). For an encoding-parameter comparison and experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark). 
+ +--- + +## Example + +```bash +lerobot-record \ + --robot.type=so100_follower \ + --robot.port=/dev/tty.usbmodem58760431541 \ + --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \ + --robot.id=black \ + --teleop.type=so100_leader \ + --teleop.port=/dev/tty.usbmodem58760431551 \ + --teleop.id=blue \ + --dataset.repo_id=/ \ + --dataset.num_episodes=2 \ + --dataset.single_task="Grab the cube" \ + --dataset.streaming_encoding=true \ + --dataset.encoder_threads=2 \ + --dataset.camera_encoder.vcodec=h264 \ + --dataset.camera_encoder.preset=fast \ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \ + --display_data=true +``` + +--- + +## Tuning parameters + + +The defaults are tuned to balance **compression ratio**, **visual quality**, and **decoding/seek speed** for typical robotics datasets. Changing them can affect both recording (CPU load, frame drops) and training (decoding throughput, image quality). + +Only override these parameters if you have a specific reason to, and measure the impact on your pipeline before relying on the new settings. + + + +All flags below are prefixed with `--dataset.camera_encoder.` on the CLI. + +| Parameter | Type | Default | Description | +| --------------- | ---------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vcodec` | `str` | `"libsvtav1"` | Video codec name. `"auto"` picks the first available hardware encoder from a fixed preference list, falling back to `libsvtav1`. | +| `pix_fmt` | `str` | `"yuv420p"` | Output pixel format. Must be supported by the chosen codec in your FFmpeg build. | +| `g` | `int` | `2` | GOP size — a keyframe every `g` frames. Emitted as FFmpeg option `g`. 
| +| `crf` | `int` or `float` | `30` | Abstract quality value, mapped per codec (see the [mapping](#mapping-videoencoderconfig--ffmpeg-options) below). Lower → higher quality / larger output where the mapping is monotone. | +| `preset` | `int` or `str` | `12` \* | Encoder speed preset; meaning depends on the codec.
\* When unset and `vcodec=libsvtav1`, LeRobot defaults to `12`. | +| `fast_decode` | `int` | `0` | `libsvtav1`: `0–2`, passed via `svtav1-params`.
`h264` / `hevc` (software): if `>0`, sets `tune=fastdecode`.
Other codecs: usually unused. | +| `video_backend` | `str` | `"pyav"` | Only `"pyav"` is currently implemented for video encoding. | +| `extra_options` | `dict` | `{}` | Extra FFmpeg or codec specific options merged after the structured fields above. Cannot override keys already set by those fields. | + +--- + +## Persistence in dataset metadata + +After the first episode of a video stream is encoded, the encoder configuration is **persisted into the dataset metadata** (`meta/info.json`) under each video feature, alongside the values probed from the file itself. For a video feature `observation.images.`, the layout in `info.json` is: + +```json +{ + "features": { + "observation.images.laptop": { + "dtype": "video", + "shape": [480, 640, 3], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "h264", + "video.pix_fmt": "yuv420p", + "video.fps": 30, + "video.channels": 3, + "video.is_depth_map": false, + "video.g": 2, + "video.crf": 30, + "video.preset": "fast", + "video.fast_decode": 0, + "video.video_backend": "pyav", + "video.extra_options": { "tune": "film", "profile:v": "high", "bf": 2 } + } + } + } +} +``` + +Two sources contribute to the `info` block: + +- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present. +- **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`. + + + This block is populated **once**, from the **first** episode. It assumes every + episode in the dataset was encoded with the same `camera_encoder`. Changing + encoder settings partway through a recording is not supported — the + `info.json` will only reflect the parameters used for the first episode. 
+ + +--- + +## Merging datasets + +When aggregating datasets with `merge_datasets`, video files are concatenated as-is (no re-encoding), and encoder fields in `info.json` are merged per-key: + +- **Stream-derived fields must match** across sources: `video.codec`, `video.pix_fmt`, `video.height`, `video.width`, `video.fps`. Otherwise FFmpeg's concat demuxer fails. +- **Encoder-tuning fields are merged loosely**: `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.extra_options`. If every source agrees, the value is kept; if not, it's set to `null` (or `{}` for `video.extra_options`) and a warning is logged. diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py index ab74c3cd3..c3fe246cd 100644 --- a/src/lerobot/configs/__init__.py +++ b/src/lerobot/configs/__init__.py @@ -31,6 +31,12 @@ from .types import ( PolicyFeature, RTCAttentionSchedule, ) +from .video import ( + VALID_VIDEO_CODECS, + VIDEO_ENCODER_INFO_KEYS, + VideoEncoderConfig, + camera_encoder_defaults, +) __all__ = [ # Types @@ -46,4 +52,10 @@ __all__ = [ "PeftConfig", "PreTrainedConfig", "WandBConfig", + "VideoEncoderConfig", + # Defaults + "camera_encoder_defaults", + # Constants + "VALID_VIDEO_CODECS", + "VIDEO_ENCODER_INFO_KEYS", ] diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py index e3e17e62b..d5c6fa312 100644 --- a/src/lerobot/configs/dataset.py +++ b/src/lerobot/configs/dataset.py @@ -14,10 +14,12 @@ """Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``.""" -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime from pathlib import Path +from .video import VideoEncoderConfig, camera_encoder_defaults + @dataclass class DatasetRecordConfig: @@ -55,10 +57,9 @@ class DatasetRecordConfig: # Number of episodes to record before batch encoding videos # Set to 1 for immediate encoding (default behavior), or higher for batched encoding 
video_encoding_batch_size: int = 1 - # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto', - # or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'. - # Use 'auto' to auto-detect the best available hardware encoder. - vcodec: str = "libsvtav1" + # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys, + # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``). + camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) # Enable streaming video encoding: encode frames in real-time during capture instead # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding streaming_encoding: bool = False diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index b1eebba94..b809e71d9 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, field from lerobot.transforms import ImageTransformsConfig -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.utils.import_utils import get_safe_default_video_backend @dataclass @@ -34,7 +34,7 @@ class DatasetConfig: image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig) revision: str | None = None use_imagenet_stats: bool = True - video_backend: str = field(default_factory=get_safe_default_codec) + video_backend: str = field(default_factory=get_safe_default_video_backend) # When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0). # This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion. 
return_uint8: bool = False diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py index f2a1d3065..c285025ad 100644 --- a/src/lerobot/configs/eval.py +++ b/src/lerobot/configs/eval.py @@ -18,8 +18,8 @@ from logging import getLogger from pathlib import Path from lerobot import envs, policies # noqa: F401 -from lerobot.configs import parser +from . import parser from .default import EvalConfig from .policies import PreTrainedConfig diff --git a/src/lerobot/configs/rewards.py b/src/lerobot/configs/rewards.py index d495160bf..a53d5a417 100644 --- a/src/lerobot/configs/rewards.py +++ b/src/lerobot/configs/rewards.py @@ -27,12 +27,13 @@ from huggingface_hub import hf_hub_download from huggingface_hub.constants import CONFIG_NAME from huggingface_hub.errors import HfHubHTTPError -from lerobot.configs.types import PolicyFeature from lerobot.optim.optimizers import OptimizerConfig from lerobot.optim.schedulers import LRSchedulerConfig from lerobot.utils.device_utils import auto_select_torch_device, is_torch_device_available from lerobot.utils.hub import HubMixin +from .types import PolicyFeature + T = TypeVar("T", bound="RewardModelConfig") logger = logging.getLogger(__name__) diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index c5b4ff5f5..55498d3ac 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -25,11 +25,11 @@ from huggingface_hub import hf_hub_download from huggingface_hub.errors import HfHubHTTPError from lerobot import envs -from lerobot.configs import parser from lerobot.optim import LRSchedulerConfig, OptimizerConfig from lerobot.utils.hub import HubMixin from lerobot.utils.sample_weighting import SampleWeightingConfig +from . 
import parser from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig from .policies import PreTrainedConfig from .rewards import RewardModelConfig diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py new file mode 100644 index 000000000..bf2471453 --- /dev/null +++ b/src/lerobot/configs/video.py @@ -0,0 +1,235 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Note: We subclass str so that serialization is straightforward +# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json + +"""Video encoder configurations.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any + +from lerobot.utils.import_utils import require_package + +logger = logging.getLogger(__name__) + +# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend. +# Determines the order of preference for auto-selection when vcodec="auto" is used. +HW_VIDEO_CODECS = [ + "h264_videotoolbox", # macOS + "hevc_videotoolbox", # macOS + "h264_nvenc", # NVIDIA GPU + "hevc_nvenc", # NVIDIA GPU + "h264_vaapi", # Linux Intel/AMD + "h264_qsv", # Intel Quick Sync +] +VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}) +# Aliases for legacy video codec names. 
+VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"} + + +LIBSVTAV1_DEFAULT_PRESET: int = 12 + +# Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`). +# ``vcodec`` and ``pix_fmt`` are derived from the video stream directly. +VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset( + {"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"} +) +VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset( + f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES +) + + +@dataclass +class VideoEncoderConfig: + """Video encoder configuration. + + Attributes: + vcodec: Video encoder name. ``"auto"`` is resolved during + construction (HW encoder if available, else ``libsvtav1``). + pix_fmt: Pixel format (e.g. ``"yuv420p"``). + g: GOP size (keyframe interval). + crf: Quality level — mapped to the native quality parameter of the + codec (``crf`` for software, ``qp`` for NVENC/VAAPI, + ``q:v`` for VideoToolbox, ``global_quality`` for QSV). + preset: Speed/quality preset. Accepted type is per-codec. + fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2) + embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values + set ``tune=fastdecode``. Ignored for other codecs. + video_backend: Python backend to be used for encoding. Only ``"pyav"`` + is currently supported. + extra_options: Free-form dictionary of additional video encoder options + (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``). + """ + + vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ? + pix_fmt: str = "yuv420p" + g: int | None = 2 + crf: int | float | None = 30 + preset: int | str | None = None + fast_decode: int = 0 + # TODO(CarolinePascal): add torchcodec support + find a way to unify the + # two backends (encoding and decoding).
+ video_backend: str = "pyav" + extra_options: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + self.resolve_vcodec() + # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work". + if self.preset is None and self.vcodec == "libsvtav1": + self.preset = LIBSVTAV1_DEFAULT_PRESET + self.validate() + + @classmethod + def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig: + """Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block. + Missing or ``None`` values fall back to the class defaults. + """ + video_info = video_info or {} + kwargs: dict[str, Any] = {} + + for src_key, dst_field in (("video.codec", "vcodec"), ("video.pix_fmt", "pix_fmt")): + value = video_info.get(src_key) + if value is not None: + kwargs[dst_field] = value + + for field_name in VIDEO_ENCODER_INFO_FIELD_NAMES: + value = video_info.get(f"video.{field_name}") + if value is None: + continue + # Persisted as ``{}`` after merges with disagreeing sources — treat as default. + if field_name == "extra_options" and not value: + continue + kwargs[field_name] = value + + return cls(**kwargs) + + def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: + """Return the subset of available encoders based on the specified video backend. + + Args: + encoders: List of encoder names to detect. If a string, it is converted to a list. + Returns: + List of available encoder names. If the video backend is not "pyav", returns an empty list. 
+ """ + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets import detect_available_encoders_pyav + + return detect_available_encoders_pyav(encoders) + return [] + + def validate(self) -> None: + """Validate the video encoder configuration.""" + if self.video_backend == "pyav": + require_package("av", extra="dataset") + from lerobot.datasets import check_video_encoder_parameters_pyav + + check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options()) + + def resolve_vcodec(self) -> None: + """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder. + + For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the + resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``. + + Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are + rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``). + """ + self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec) + if self.vcodec not in VALID_VIDEO_CODECS: + raise ValueError(f"Invalid vcodec '{self.vcodec}'. 
Must be one of: {sorted(VALID_VIDEO_CODECS)}") + if self.vcodec == "auto": + available = self.detect_available_encoders(HW_VIDEO_CODECS) + for encoder in HW_VIDEO_CODECS: + if encoder in available: + logger.info(f"Auto-selected video codec: {encoder}") + self.vcodec = encoder + return + logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'") + self.vcodec = "libsvtav1" + + if self.detect_available_encoders(self.vcodec): + logger.info(f"Using video codec: {self.vcodec}") + return + raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}") + + def get_codec_options( + self, encoder_threads: int | None = None, as_strings: bool = False + ) -> dict[str, Any]: + """Translate the tuning fields to codec-specific options. + + ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field. + + Args: + encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs. + For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``. + For h264/hevc, this is mapped to ``threads``. + Hardware encoders ignore this parameter. + as_strings: If ``True``, casts values to strings. + """ + opts: dict[str, Any] = {} + + def set_if(key: str, value: Any) -> None: + if value is not None: + opts[key] = value if not as_strings else str(value) + + # GOP size is not a codec-specific option, so it is always set. 
+ set_if("g", self.g) + + if self.vcodec == "libsvtav1": + set_if("crf", self.crf) + set_if("preset", self.preset) + svtav1_parts: list[str] = [] + if self.fast_decode is not None: + svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}") + if encoder_threads is not None: + svtav1_parts.append(f"lp={encoder_threads}") + if svtav1_parts: + opts["svtav1-params"] = ":".join(svtav1_parts) + elif self.vcodec in ("h264", "hevc"): + set_if("crf", self.crf) + set_if("preset", self.preset) + if self.fast_decode: + opts["tune"] = "fastdecode" + set_if("threads", encoder_threads) + elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): + if self.crf is not None: + opts["q:v"] = max(1, min(100, 100 - self.crf * 2)) + elif self.vcodec in ("h264_nvenc", "hevc_nvenc"): + opts["rc"] = 0 + set_if("qp", self.crf) + set_if("preset", self.preset) + elif self.vcodec == "h264_vaapi": + set_if("qp", self.crf) + elif self.vcodec == "h264_qsv": + set_if("global_quality", self.crf) + set_if("preset", self.preset) + else: + set_if("crf", self.crf) + set_if("preset", self.preset) + + # Extra options are merged last but never override structured fields (values are kept as given). 
+ for k, v in self.extra_options.items(): + if k not in opts: + set_if(k, v) + + return opts + + +def camera_encoder_defaults() -> VideoEncoderConfig: + """Return a :class:`VideoEncoderConfig` with RGB-camera defaults.""" + return VideoEncoderConfig() diff --git a/src/lerobot/datasets/__init__.py b/src/lerobot/datasets/__init__.py index 6c42959a5..b51ef0222 100644 --- a/src/lerobot/datasets/__init__.py +++ b/src/lerobot/datasets/__init__.py @@ -40,6 +40,7 @@ from .io_utils import load_episodes, write_stats from .lerobot_dataset import LeRobotDataset from .multi_dataset import MultiLeRobotDataset from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features +from .pyav_utils import check_video_encoder_parameters_pyav, detect_available_encoders_pyav from .sampler import EpisodeAwareSampler from .streaming_dataset import StreamingLeRobotDataset from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card @@ -59,6 +60,8 @@ __all__ = [ "MultiLeRobotDataset", "StreamingLeRobotDataset", "VideoEncodingManager", + "check_video_encoder_parameters_pyav", + "detect_available_encoders_pyav", "add_features", "aggregate_datasets", "aggregate_pipeline_dataset_features", diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py index 90fc8f583..5db3f934d 100644 --- a/src/lerobot/datasets/aggregate.py +++ b/src/lerobot/datasets/aggregate.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import logging import shutil from pathlib import Path @@ -23,9 +24,11 @@ import datasets import pandas as pd import tqdm +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS + from .compute_stats import aggregate_stats from .dataset_metadata import LeRobotDatasetMetadata -from .feature_utils import get_hf_features_from_features +from .feature_utils import features_equal_for_merge, get_hf_features_from_features from .io_utils import ( get_file_size_in_mb, get_parquet_file_size_in_mb, @@ -46,11 +49,54 @@ from .utils import ( from .video_utils import concatenate_video_files, get_video_duration_in_s +def merge_video_feature_info_for_aggregate(all_metadata: list[LeRobotDatasetMetadata]) -> dict[str, dict]: + """Create a merged video feature info dictionary for aggregation. The video encoder info is merged field-by-field: each key is kept only when every source agrees; otherwise that key is set to ``null`` (or ``{}`` for ``video.extra_options``) and a warning is logged. + + Args: + all_metadata: List of LeRobotDatasetMetadata objects to merge. + + Returns: + dict: A dictionary of merged video feature info. + """ + merged_info = copy.deepcopy(all_metadata[0].features) + video_keys = [k for k in merged_info if merged_info[k].get("dtype") == "video"] + + for vk in video_keys: + video_infos = [m.features.get(vk, {}).get("info") or {} for m in all_metadata] + base_video_info = video_infos[0] + + merged_encoder_info: dict = {} + fallback_keys: list[str] = [] + for info_key in VIDEO_ENCODER_INFO_KEYS: + values = [info.get(info_key, None) for info in video_infos] + first_value = values[0] + all_match = all(v == first_value for v in values[1:]) + + if all_match: + merged_encoder_info[info_key] = first_value + else: + fallback_keys.append(info_key) + merged_encoder_info[info_key] = {} if info_key == "video.extra_options" else None + + if fallback_keys: + logging.warning( + f"Merging heterogeneous or incomplete video encoder metadata for feature {vk}. 
" + f"Setting these keys to null: {fallback_keys}.", + ) + + merged_info[vk]["info"] = {**base_video_info, **merged_encoder_info} + # TODO(CarolinePascal): make this variable once we have support for other video backends. + merged_info[vk]["info"]["video.video_backend"] = "pyav" + + return merged_info + + def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]): """Validates that all dataset metadata have consistent properties. Ensures all datasets have the same fps, robot_type, and features to guarantee compatibility when aggregating them into a single dataset. + Video encoder info is not considered for validation but is merged during aggregation in ``merge_video_feature_info_for_aggregate``. Args: all_metadata: List of LeRobotDatasetMetadata objects to validate. @@ -74,7 +120,7 @@ def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]): raise ValueError( f"Same robot_type is expected, but got robot_type={meta.robot_type} instead of {robot_type}." ) - if features != meta.features: + if not features_equal_for_merge(features, meta.features): raise ValueError( f"Same features is expected, but got features={meta.features} instead of {features}." ) @@ -274,7 +320,8 @@ def aggregate_datasets( LeRobotDatasetMetadata(repo_id, root=root) for repo_id, root in zip(repo_ids, roots, strict=False) ] ) - fps, robot_type, features = validate_all_metadata(all_metadata) + fps, robot_type, _ = validate_all_metadata(all_metadata) + features = merge_video_feature_info_for_aggregate(all_metadata) video_keys = [key for key in features if features[key]["dtype"] == "video"] dst_meta = LeRobotDatasetMetadata.create( @@ -332,7 +379,6 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu videos_idx: Dictionary tracking video chunk and file indices. 
video_files_size_in_mb: Maximum size for video files in MB (defaults to DEFAULT_VIDEO_FILE_SIZE_IN_MB) chunk_size: Maximum number of files per chunk (defaults to DEFAULT_CHUNK_SIZE) - Returns: dict: Updated videos_idx with current chunk and file indices. """ @@ -414,9 +460,11 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu current_dst_duration = dst_file_durations.get(dst_key, 0) videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_dst_duration videos_idx[key]["src_to_dst"][(src_chunk_idx, src_file_idx)] = dst_key + # TODO(CarolinePascal): Move the check before the loop to avoid failing in the middle + add possibility to re-encode the video if the check fails concatenate_video_files( [dst_path, src_path], dst_path, + compatibility_check=True, ) # Update duration of this destination file dst_file_durations[dst_key] = current_dst_duration + src_duration diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py index b404ddb18..3c58774c3 100644 --- a/src/lerobot/datasets/dataset_metadata.py +++ b/src/lerobot/datasets/dataset_metadata.py @@ -24,6 +24,7 @@ import pyarrow as pa import pyarrow.parquet as pq from huggingface_hub import snapshot_download +from lerobot.configs import VideoEncoderConfig from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE from lerobot.utils.feature_utils import _validate_feature_names from lerobot.utils.utils import flatten_dict @@ -534,10 +535,23 @@ class LeRobotDatasetMetadata: self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats write_stats(self.stats, self.root) - def update_video_info(self, video_key: str | None = None) -> None: - """ + def update_video_info( + self, + video_key: str | None = None, + camera_encoder: VideoEncoderConfig | None = None, + ) -> None: + """Populate per-feature video info in ``info.json``. 
+ Warning: this function writes info from first episode videos, implicitly assuming that all videos have been encoded the same way. Also, this means it assumes the first episode exists. + + Args: + video_key: If provided, only update this video key. Otherwise update + all video keys in the dataset. + camera_encoder: Encoder configuration used to produce the + videos. When provided, its fields are recorded as + ``video.`` entries alongside the stream-derived + ``video.*`` entries (see :func:`get_video_info`). """ if video_key is not None and video_key not in self.video_keys: raise ValueError(f"Video key {video_key} not found in dataset") @@ -546,7 +560,7 @@ class LeRobotDatasetMetadata: for key in video_keys: if not self.features[key].get("info", None): video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0) - self.info.features[key]["info"] = get_video_info(video_path) + self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder) def update_chunk_settings( self, diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 46dd9bff2..489914fbc 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -36,6 +36,7 @@ import pyarrow.parquet as pq import torch from tqdm import tqdm +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE from lerobot.utils.utils import flatten_dict @@ -62,7 +63,10 @@ from .utils import ( DEFAULT_EPISODES_PATH, update_chunk_file_indices, ) -from .video_utils import encode_video_frames, get_video_info +from .video_utils import ( + encode_video_frames, + get_video_info, +) def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict: @@ -95,6 +99,11 @@ def delete_episodes( ) -> LeRobotDataset: """Delete episodes from a LeRobotDataset and create a new dataset. 
+ Video segments that need re-encoding (because the source file mixes kept and + deleted episodes) are re-encoded with the source dataset's existing encoder + settings — read back from ``meta/info.json`` — so the output dataset stays + consistent with its own metadata. + Args: dataset: The source LeRobotDataset. episode_indices: List of episode indices to delete. @@ -157,6 +166,11 @@ def split_dataset( ) -> dict[str, LeRobotDataset]: """Split a LeRobotDataset into multiple smaller datasets. + Video segments that need re-encoding (because the source file mixes episodes + that fall into different splits) are re-encoded with the source dataset's + existing encoder settings — read back from ``meta/info.json`` — so each + output split stays consistent with its own metadata. + Args: dataset: The source LeRobotDataset to split. splits: Either a dict mapping split names to episode indices, or a dict mapping @@ -578,8 +592,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", + camera_encoder: VideoEncoderConfig, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -593,8 +606,7 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - vcodec: Video codec to use for encoding. - pix_fmt: Pixel format for output video. + camera_encoder: Video encoder settings used to re-encode the kept frames. """ from fractions import Fraction @@ -619,12 +631,13 @@ def _keep_episodes_from_video_with_av( # Convert fps to Fraction for PyAV compatibility. 
fps_fraction = Fraction(fps).limit_denominator(1000) - v_out = out.add_stream(vcodec, rate=fps_fraction) + codec_options = camera_encoder.get_codec_options(as_strings=True) + v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options) # PyAV type stubs don't distinguish video streams from audio/subtitle streams. v_out.width = v_in.codec_context.width v_out.height = v_in.codec_context.height - v_out.pix_fmt = pix_fmt + v_out.pix_fmt = camera_encoder.pix_fmt # Set time_base to match the frame rate for proper timestamp handling. v_out.time_base = Fraction(1, int(fps)) @@ -687,14 +700,14 @@ def _copy_and_reindex_videos( src_dataset: LeRobotDataset, dst_meta: LeRobotDatasetMetadata, episode_mapping: dict[int, int], - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", ) -> dict[int, dict]: """Copy and filter video files, only re-encoding files with deleted episodes. For video files that only contain kept episodes, we copy them directly. For files with mixed kept/deleted episodes, we use PyAV filters to efficiently - re-encode only the desired segments. + re-encode only the desired segments. The encoder used for re-encoding is + derived per video key from the source dataset's ``meta/info.json`` so the + destination metadata keeps describing the videos accurately. 
Args: src_dataset: Source dataset to copy from @@ -711,6 +724,9 @@ def _copy_and_reindex_videos( for video_key in src_dataset.meta.video_keys: logging.info(f"Processing videos for {video_key}") + camera_encoder = VideoEncoderConfig.from_video_info( + src_dataset.meta.info.features.get(video_key, {}).get("info") + ) if dst_meta.video_path is None: raise ValueError("Destination metadata has no video_path defined") @@ -792,8 +808,7 @@ def _copy_and_reindex_videos( dst_video_path, episodes_to_keep_ranges, src_dataset.meta.fps, - vcodec, - pix_fmt, + camera_encoder, ) cumulative_ts = 0.0 @@ -1264,11 +1279,7 @@ def _estimate_frame_size_via_calibration( episode_indices: list[int], temp_dir: Path, fps: int, - vcodec: str, - pix_fmt: str, - g: int, - crf: int, - fast_decode: int, + camera_encoder: VideoEncoderConfig, num_calibration_frames: int = 30, ) -> float: """Estimate MB per frame by encoding a small calibration sample. @@ -1282,11 +1293,7 @@ def _estimate_frame_size_via_calibration( episode_indices: List of episode indices being processed. temp_dir: Temporary directory for calibration files. fps: Frames per second for video encoding. - vcodec: Video codec (libsvtav1, h264, hevc). - pix_fmt: Pixel format (yuv420p, etc.). - g: GOP size (group of pictures). - crf: Constant Rate Factor (quality). - fast_decode: Fast decode tuning parameter. + camera_encoder: Video encoder settings used for calibration encoding. num_calibration_frames: Number of frames to use for calibration (default: 30). 
Returns: @@ -1322,11 +1329,7 @@ def _estimate_frame_size_via_calibration( imgs_dir=calibration_dir, video_path=calibration_video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, overwrite=True, ) @@ -1644,11 +1647,7 @@ def convert_image_to_video_dataset( dataset: LeRobotDataset, output_dir: Path | None = None, repo_id: str | None = None, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int = 2, - crf: int = 30, - fast_decode: int = 0, + camera_encoder: VideoEncoderConfig | None = None, episode_indices: list[int] | None = None, num_workers: int = 4, max_episodes_per_batch: int | None = None, @@ -1663,11 +1662,8 @@ def convert_image_to_video_dataset( dataset: The source LeRobot dataset with images output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - vcodec: Video codec (default: libsvtav1) - pix_fmt: Pixel format (default: yuv420p) - g: Group of pictures size (default: 2) - crf: Constant rate factor (default: 30) - fast_decode: Fast decode tuning (default: 0) + camera_encoder: Video encoder settings + (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). 
episode_indices: List of episode indices to convert (None = all episodes) num_workers: Number of threads for parallel processing (default: 4) max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) @@ -1676,6 +1672,9 @@ def convert_image_to_video_dataset( Returns: New LeRobotDataset with images encoded as videos """ + if camera_encoder is None: + camera_encoder = camera_encoder_defaults() + # Check that it's an image dataset if len(dataset.meta.video_keys) > 0: raise ValueError( @@ -1699,7 +1698,10 @@ def convert_image_to_video_dataset( logging.info( f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" ) - logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") + logging.info( + f"Video codec: {camera_encoder.vcodec}, pixel format: {camera_encoder.pix_fmt}, " + f"GOP: {camera_encoder.g}, CRF: {camera_encoder.crf}" + ) # Create new features dict, converting image features to video features new_features = {} @@ -1769,11 +1771,7 @@ def convert_image_to_video_dataset( episode_indices=episode_indices, temp_dir=temp_dir, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, ) logging.info(f"Processing camera: {img_key}") @@ -1815,11 +1813,7 @@ def convert_image_to_video_dataset( imgs_dir=imgs_dir, video_path=video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder=camera_encoder, overwrite=True, ) @@ -1865,7 +1859,9 @@ def convert_image_to_video_dataset( video_path = new_meta.root / new_meta.video_path.format( video_key=img_key, chunk_index=0, file_index=0 ) - new_meta.info.features[img_key]["info"] = get_video_info(video_path) + new_meta.info.features[img_key]["info"] = get_video_info( + video_path, camera_encoder=camera_encoder + ) write_info(new_meta.info, new_meta.root) diff --git a/src/lerobot/datasets/dataset_writer.py 
b/src/lerobot/datasets/dataset_writer.py index cf306a86a..6be63194f 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -31,6 +31,8 @@ import PIL.Image import pyarrow.parquet as pq import torch +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults + from .compute_stats import compute_episode_stats from .dataset_metadata import LeRobotDatasetMetadata from .feature_utils import ( @@ -65,14 +67,19 @@ def _encode_video_worker( episode_index: int, root: Path, fps: int, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) img_dir = (root / fpath).parent encode_video_frames( - img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads + img_dir, + temp_path, + fps, + camera_encoder=camera_encoder, + encoder_threads=encoder_threads, + overwrite=True, ) shutil.rmtree(img_dir) return temp_path @@ -89,20 +96,22 @@ class DatasetWriter: self, meta: LeRobotDatasetMetadata, root: Path, - vcodec: str, + camera_encoder: VideoEncoderConfig | None, encoder_threads: int | None, batch_encoding_size: int, streaming_encoder: StreamingVideoEncoder | None = None, initial_frames: int = 0, ): - """Initialize the writer with metadata, codec, and encoding config. + """Initialize the writer with metadata, codec, and encoder config. Args: meta: Dataset metadata instance (used for feature schema, chunk settings, and episode persistence). root: Local dataset root directory. - vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``). - encoder_threads: Threads per encoder instance. ``None`` for auto. + camera_encoder: Video encoder settings applied to all cameras. + ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`. 
+ encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder` @@ -111,7 +120,7 @@ class DatasetWriter: """ self._meta = meta self._root = root - self._vcodec = vcodec + self._camera_encoder = camera_encoder or camera_encoder_defaults() self._encoder_threads = encoder_threads self._batch_encoding_size = batch_encoding_size self._streaming_encoder = streaming_encoder @@ -284,7 +293,7 @@ class DatasetWriter: episode_index, self._root, self._meta.fps, - self._vcodec, + self._camera_encoder, self._encoder_threads, ): video_key for video_key in self._meta.video_keys @@ -495,7 +504,7 @@ class DatasetWriter: # Update video info (only needed when first episode is encoded) if episode_index == 0: - self._meta.update_video_info(video_key) + self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder) write_info(self._meta.info, self._meta.root) metadata = { @@ -564,7 +573,12 @@ class DatasetWriter: def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: """Use ffmpeg to convert frames stored as png into mp4 videos.""" return _encode_video_worker( - video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads + video_key, + episode_index, + self._root, + self._meta.fps, + self._camera_encoder, + self._encoder_threads, ) def close_writer(self) -> None: diff --git a/src/lerobot/datasets/feature_utils.py b/src/lerobot/datasets/feature_utils.py index 2ab4b0ea6..d5a550a4c 100644 --- a/src/lerobot/datasets/feature_utils.py +++ b/src/lerobot/datasets/feature_utils.py @@ -19,6 +19,7 @@ import datasets import numpy as np from PIL import Image as PILImage +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS from lerobot.utils.constants import DEFAULT_FEATURES from lerobot.utils.utils import is_valid_numpy_dtype_string @@ -108,6 +109,41 
@@ def create_empty_dataset_info( ) +def features_equal_for_merge(features_a: dict[str, dict], features_b: dict[str, dict]) -> bool: + """Return whether two LeRobotDatasetMetadata ``features`` dicts are compatible for aggregation. + + For video features, keys under ``info`` related to video encoding parameters are ignored during + comparison as they do not prevent aggregation. + """ + + def _without_encoder_info_keys(feature: dict) -> dict: + filtered = dict(feature) + filtered_info = filtered.get("info") + if isinstance(filtered_info, dict): + filtered["info"] = { + info_key: info_value + for info_key, info_value in filtered_info.items() + if info_key not in VIDEO_ENCODER_INFO_KEYS + } + return filtered + + if set(features_a) != set(features_b): + return False + for key in features_a: + fa_key = features_a[key] + fb_key = features_b[key] + if fa_key.get("dtype") != fb_key.get("dtype"): + return False + if fa_key.get("dtype") != "video": + if fa_key != fb_key: + return False + continue + + if _without_encoder_info_keys(fa_key) != _without_encoder_info_keys(fb_key): + return False + return True + + def check_delta_timestamps( delta_timestamps: dict[str, list[float]], fps: int, tolerance_s: float, raise_value_error: bool = True ) -> bool: diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index ab55aa9f8..9734bcc74 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -24,6 +24,7 @@ import torch.utils from huggingface_hub import HfApi, snapshot_download from huggingface_hub.errors import RevisionNotFoundError +from lerobot.configs import VideoEncoderConfig from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata @@ -36,8 +37,7 @@ from .utils import ( ) from .video_utils import ( StreamingVideoEncoder, - get_safe_default_codec, - resolve_vcodec, + get_safe_default_video_backend, ) logger = 
logging.getLogger(__name__) @@ -59,10 +59,10 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: str | None = None, return_uint8: bool = False, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ): """ 2 modes are available for instantiating this class, depending on 2 different use cases: @@ -183,16 +183,15 @@ class LeRobotDataset(torch.utils.data.Dataset): You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. - vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc', - 'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'. - Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder. + camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras + (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` + is used by the writer. + encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the + codec decide. streaming_encoding (bool, optional): If True, encode video frames in real-time during capture instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False. encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using streaming encoding. Defaults to 30 (~1s at 30fps). - encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the - codec auto-detect (default). 
Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for - libsvtav1 and 'threads' for h264/hevc. Note: Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to @@ -207,10 +206,9 @@ class LeRobotDataset(torch.utils.data.Dataset): self.delta_timestamps = delta_timestamps self.tolerance_s = tolerance_s self.revision = revision if revision else CODEBASE_VERSION - self._video_backend = video_backend if video_backend else get_safe_default_codec() + self._video_backend = video_backend if video_backend else get_safe_default_video_backend() self._return_uint8 = return_uint8 self._batch_encoding_size = batch_encoding_size - self._vcodec = resolve_vcodec(vcodec) self._encoder_threads = encoder_threads if self._requested_root is not None: @@ -273,12 +271,15 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(self.meta.video_keys) > 0: streaming_enc = self._build_streaming_encoder( - self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads + self.meta.fps, + camera_encoder, + encoder_queue_maxsize, + encoder_threads, ) self.writer = DatasetWriter( meta=self.meta, root=self.root, - vcodec=self._vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -320,17 +321,13 @@ class LeRobotDataset(torch.utils.data.Dataset): @staticmethod def _build_streaming_encoder( fps: int, - vcodec: str, + camera_encoder: VideoEncoderConfig | None, encoder_queue_maxsize: int, encoder_threads: int | None, ) -> StreamingVideoEncoder: return StreamingVideoEncoder( fps=fps, - vcodec=vcodec, - pix_fmt="yuv420p", - g=2, - crf=30, - preset=None, + camera_encoder=camera_encoder, queue_maxsize=encoder_queue_maxsize, encoder_threads=encoder_threads, ) @@ -647,7 +644,7 @@ class LeRobotDataset(torch.utils.data.Dataset): image_writer_threads: int = 0, video_backend: str | None = None, batch_encoding_size: int = 
1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, metadata_buffer_size: int = 10, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -678,20 +675,20 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend (used when reading back). batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. ``1`` means encode immediately. - vcodec: Video codec for encoding. Options include ``'libsvtav1'``, - ``'h264'``, ``'hevc'``, ``'auto'``. + camera_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. metadata_buffer_size: Number of episode metadata records to buffer before flushing to parquet. streaming_encoding: If ``True``, encode video frames in real-time during capture instead of writing images first. encoder_queue_maxsize: Max buffered frames per camera when using streaming encoding. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A new :class:`LeRobotDataset` in write mode. 
""" - vcodec = resolve_vcodec(vcodec) obj = cls.__new__(cls) obj.meta = LeRobotDatasetMetadata.create( repo_id=repo_id, @@ -712,23 +709,23 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.image_transforms = None obj.delta_timestamps = None obj.episodes = None - obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec() + obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec obj._encoder_threads = encoder_threads # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: - streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads) + streaming_enc = cls._build_streaming_encoder( + fps, camera_encoder, encoder_queue_maxsize, encoder_threads + ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -751,12 +748,12 @@ class LeRobotDataset(torch.utils.data.Dataset): force_cache_sync: bool = False, video_backend: str | None = None, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, image_writer_processes: int = 0, image_writer_threads: int = 0, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ) -> "LeRobotDataset": """Resume recording on an existing dataset. @@ -779,13 +776,15 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend for reading back data. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. - vcodec: Video codec for encoding. 
+ camera_encoder: Video encoder settings for cameras (codec, quality, etc.). + When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. image_writer_processes: Subprocesses for async image writing. image_writer_threads: Threads for async image writing. streaming_encoding: If ``True``, encode video in real-time during capture. encoder_queue_maxsize: Max buffered frames per camera for streaming. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A :class:`LeRobotDataset` in write mode, ready to append episodes. @@ -796,7 +795,6 @@ class LeRobotDataset(torch.utils.data.Dataset): "Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt " "the shared cache. Please provide a local directory path." ) - vcodec = resolve_vcodec(vcodec) obj = cls.__new__(cls) obj.repo_id = repo_id obj._requested_root = Path(root) @@ -805,11 +803,9 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.image_transforms = None obj.delta_timestamps = None obj.episodes = None - obj._video_backend = video_backend if video_backend else get_safe_default_codec() + obj._video_backend = video_backend if video_backend else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec - obj._encoder_threads = encoder_threads if obj._requested_root is not None: obj._requested_root.mkdir(exist_ok=True, parents=True) @@ -818,21 +814,22 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.meta = LeRobotDatasetMetadata( obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync ) + + obj._encoder_threads = encoder_threads obj.root = obj.meta.root # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer for appending streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = 
cls._build_streaming_encoder( - obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads + obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder=camera_encoder, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py new file mode 100644 index 000000000..d291f8b40 --- /dev/null +++ b/src/lerobot/datasets/pyav_utils.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyAV-based compatibility checks for :class:`VideoEncoderConfig`. + +Centralises all :mod:`av` introspection of the bundled FFmpeg build. +Checks degrade to a no-op when the target codec isn't available locally. 
+""" + +import functools +import logging +from typing import Any + +import av + +logger = logging.getLogger(__name__) + +FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE") +FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64") + + +@functools.cache +def get_codec(vcodec: str) -> av.codec.Codec | None: + """PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if unavailable.""" + try: + return av.codec.Codec(vcodec, "w") + except Exception: + return None + + +@functools.cache +def _get_codec_options_by_name(vcodec: str) -> dict[str, av.option.Option]: + """Private-option name → PyAV ``Option`` for *vcodec* (empty if unavailable).""" + codec = get_codec(vcodec) + if codec is None: + return {} + return {opt.name: opt for opt in codec.descriptor.options} + + +@functools.cache +def _get_codec_video_formats(vcodec: str) -> tuple[str, ...]: + """Pixel formats accepted by *vcodec* in PyAV's preferred order (empty if unknown).""" + codec = get_codec(vcodec) + if codec is None: + return () + return tuple(fmt.name for fmt in (codec.video_formats or [])) + + +def detect_available_encoders_pyav(encoders: list[str] | str) -> list[str]: + """Return the subset of *encoders* available as video encoders in the local FFmpeg build. + + Each name is probed directly via :func:`get_codec`; input order is preserved. 
+ """ + if isinstance(encoders, str): + encoders = [encoders] + + available: list[str] = [] + for name in encoders: + codec = get_codec(name) + if codec is not None and codec.type == "video": + available.append(name) + else: + logger.debug("encoder '%s' not available as video encoder", name) + return available + + +def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Option) -> None: + """Range-check numeric *value* and choice-check string *value* against *opt*.""" + type_name = opt.type.name + if type_name in FFMPEG_NUMERIC_OPTION_TYPES: + if isinstance(value, bool): + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + elif isinstance(value, str): + try: + num_val = float(value) + except ValueError as e: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) from e + elif isinstance(value, (float, int)): + num_val = value + else: + raise ValueError( + f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option." + ) + + # Check integer type compatibility + if type_name in FFMPEG_INTEGER_OPTION_TYPES and not num_val.is_integer(): + raise ValueError( + f"{label}={num_val!r} must be an integer for codec {vcodec!r} " + f"(FFmpeg option {opt.name!r} is {type_name}); float values are not allowed." 
+ ) + + # Check numeric range compatibility + lo, hi = float(opt.min), float(opt.max) + if lo < hi and not (lo <= num_val <= hi): + raise ValueError( + f"{label}={num_val} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]" + ) + + elif type_name == "STRING": + if isinstance(value, bool): + raise ValueError(f"{label}={value!r} is not a valid string value for codec {vcodec!r}.") + if isinstance(value, str): + str_val = value + elif isinstance(value, (int, float)): + str_val = str(value) + else: + raise ValueError(f"{label}={value!r} has unsupported type for STRING option on codec {vcodec!r}") + + # Check string choice compatibility + choices = [c.name for c in (opt.choices or [])] + if choices and str_val not in choices: + raise ValueError( + f"{label}={str_val!r} is not a supported choice for codec " + f"{vcodec!r}; valid choices: {choices}" + ) + else: + return + + +def _check_pixel_format(vcodec: str, pix_fmt: str) -> None: + formats = _get_codec_video_formats(vcodec) + if formats and pix_fmt not in formats: + raise ValueError( + f"pix_fmt={pix_fmt!r} is not supported by codec {vcodec!r}; " + f"supported pixel formats: {list(formats)}" + ) + + +def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None: + """Validate merged encoder options (typed) against the codec's published AVOptions.""" + supported_options = _get_codec_options_by_name(vcodec) + for key, value in codec_options.items(): + # GOP size is not a codec-specific option, it has to be validated separately. + if key == "g": + if isinstance(value, bool) or not isinstance(value, int) or value < 1: + raise ValueError(f"g={value!r} must be a positive integer for codec {vcodec!r}") + continue + if key not in supported_options: + continue + _check_option_value(vcodec, key, value, supported_options[key]) + + +def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options: dict[str, Any]) -> None: + """Verify *config* is compatible with the bundled FFmpeg build. 
+ + Checks pixel format, abstract tuning-field compatibility, and each merged + encoder option from :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options` + against PyAV (including numeric ``extra_options`` present in that dict). + No-op when ``config.vcodec`` isn't in the local FFmpeg build. + + Raises: + ValueError: on the first incompatibility encountered. + """ + options = _get_codec_options_by_name(vcodec) + if not options: + raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build") + _check_pixel_format(vcodec, pix_fmt) + _check_codec_options(vcodec, codec_options) diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 00ff09ee7..e823a406c 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -22,7 +22,7 @@ import shutil import tempfile import threading import warnings -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from fractions import Fraction from pathlib import Path from threading import Lock @@ -36,86 +36,14 @@ import torch from datasets.features.features import register_feature from PIL import Image -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.configs import ( + VideoEncoderConfig, + camera_encoder_defaults, +) +from lerobot.utils.import_utils import get_safe_default_video_backend logger = logging.getLogger(__name__) -# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build. -# Determines the order of preference for auto-selection when vcodec="auto" is used. 
-HW_ENCODERS = [ - "h264_videotoolbox", # macOS - "hevc_videotoolbox", # macOS - "h264_nvenc", # NVIDIA GPU - "hevc_nvenc", # NVIDIA GPU - "h264_vaapi", # Linux Intel/AMD - "h264_qsv", # Intel Quick Sync -] - -VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS) - - -def _get_codec_options( - vcodec: str, - g: int | None = 2, - crf: int | None = 30, - preset: int | None = None, -) -> dict: - """Build codec-specific options dict for video encoding.""" - options = {} - - # GOP size (keyframe interval) - supported by VideoToolbox and software encoders - if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS): - options["g"] = str(g) - - # Quality control (codec-specific parameter names) - if crf is not None: - if vcodec in ("h264", "hevc", "libsvtav1"): - options["crf"] = str(crf) - elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"): - quality = max(1, min(100, int(100 - crf * 2))) - options["q:v"] = str(quality) - elif vcodec in ("h264_nvenc", "hevc_nvenc"): - options["rc"] = "constqp" - options["qp"] = str(crf) - elif vcodec in ("h264_vaapi",): - options["qp"] = str(crf) - elif vcodec in ("h264_qsv",): - options["global_quality"] = str(crf) - - # Preset (only for libsvtav1) - if vcodec == "libsvtav1": - options["preset"] = str(preset) if preset is not None else "12" - - return options - - -def detect_available_hw_encoders() -> list[str]: - """Probe PyAV/FFmpeg for available hardware video encoders.""" - available = [] - for codec_name in HW_ENCODERS: - try: - av.codec.Codec(codec_name, "w") - available.append(codec_name) - except Exception: # nosec B110 - logger.debug("HW encoder '%s' not available", codec_name) # nosec B110 - return available - - -def resolve_vcodec(vcodec: str) -> str: - """Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1.""" - if vcodec not in VALID_VIDEO_CODECS: - raise ValueError(f"Invalid vcodec '{vcodec}'. 
Must be one of: {sorted(VALID_VIDEO_CODECS)}") - if vcodec != "auto": - logger.info(f"Using video codec: {vcodec}") - return vcodec - available = detect_available_hw_encoders() - for encoder in HW_ENCODERS: - if encoder in available: - logger.info(f"Auto-selected video codec: {encoder}") - return encoder - logger.info("No hardware encoder available, falling back to software encoder 'libsvtav1'") - return "libsvtav1" - def decode_video_frames( video_path: Path | str, @@ -143,7 +71,7 @@ def decode_video_frames( Currently supports torchcodec on cpu and pyav. """ if backend is None: - backend = get_safe_default_codec() + backend = get_safe_default_video_backend() if backend == "torchcodec": return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8) elif backend == "pyav": @@ -407,18 +335,17 @@ def encode_video_frames( imgs_dir: Path | str, video_path: Path | str, fps: int, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int | None = 2, - crf: int | None = 30, - fast_decode: int = 0, + camera_encoder: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, + *, log_level: int | None = av.logging.WARNING, overwrite: bool = False, - preset: int | None = None, - encoder_threads: int | None = None, ) -> None: """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" - vcodec = resolve_vcodec(vcodec) + if camera_encoder is None: + camera_encoder = camera_encoder_defaults() + vcodec = camera_encoder.vcodec + pix_fmt = camera_encoder.pix_fmt video_path = Path(video_path) imgs_dir = Path(imgs_dir) @@ -429,42 +356,18 @@ def encode_video_frames( video_path.parent.mkdir(parents=True, exist_ok=True) - # Encoders/pixel formats incompatibility check - if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p": - logger.warning( - f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'" - ) - pix_fmt = "yuv420p" - # Get input frames template = 
"frame-" + ("[0-9]" * 6) + ".png" input_list = sorted( glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0]) ) - # Define video output frame size (assuming all input frames are the same size) if len(input_list) == 0: raise FileNotFoundError(f"No images found in {imgs_dir}.") with Image.open(input_list[0]) as dummy_image: width, height = dummy_image.size - # Define video codec options - video_options = _get_codec_options(vcodec, g, crf, preset) - - if fast_decode: - key = "svtav1-params" if vcodec == "libsvtav1" else "tune" - value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode" - video_options[key] = value - - if encoder_threads is not None: - if vcodec == "libsvtav1": - lp_param = f"lp={encoder_threads}" - if "svtav1-params" in video_options: - video_options["svtav1-params"] += f":{lp_param}" - else: - video_options["svtav1-params"] = lp_param - else: - video_options["threads"] = str(encoder_threads) + video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True) # Set logging level if log_level is not None: @@ -501,7 +404,10 @@ def encode_video_frames( def concatenate_video_files( - input_video_paths: list[Path | str], output_video_path: Path, overwrite: bool = True + input_video_paths: list[Path | str], + output_video_path: Path, + overwrite: bool = True, + compatibility_check: bool = False, ): """ Concatenate multiple video files into a single video file using pyav. @@ -514,6 +420,7 @@ def concatenate_video_files( input_video_paths: Ordered list of input video file paths to concatenate. output_video_path: Path to the output video file. overwrite: Whether to overwrite the output video file if it already exists. Default is True. + compatibility_check: Whether to check if the input videos are compatible. Default is False. Note: - Creates a temporary directory for intermediate files that is cleaned up after use. 
@@ -532,6 +439,22 @@ def concatenate_video_files( if len(input_video_paths) == 0: raise FileNotFoundError("No input video paths provided.") + # This check may be skipped at recording time as videos are encoded with the same encoder config. + if compatibility_check: + reference_video_info = get_video_info(input_video_paths[0]) + for input_path in input_video_paths[1:]: + video_info = get_video_info(input_path) + if ( + video_info["video.height"] != reference_video_info["video.height"] + or video_info["video.width"] != reference_video_info["video.width"] + or video_info["video.fps"] != reference_video_info["video.fps"] + or video_info["video.codec"] != reference_video_info["video.codec"] + or video_info["video.pix_fmt"] != reference_video_info["video.pix_fmt"] + ): + raise ValueError( + f"Input video {input_path} is not compatible with the reference video {input_video_paths[0]}." + ) + # Create a temporary .ffconcat file to list the input video paths with tempfile.NamedTemporaryFile(mode="w", suffix=".ffconcat", delete=False) as tmp_concatenate_file: tmp_concatenate_file.write("ffconcat version 1.0\n") @@ -598,26 +521,20 @@ class _CameraEncoderThread(threading.Thread): fps: int, vcodec: str, pix_fmt: str, - g: int | None, - crf: int | None, - preset: int | None, + codec_options: dict[str, str], frame_queue: queue.Queue, result_queue: queue.Queue, stop_event: threading.Event, - encoder_threads: int | None = None, ): super().__init__(daemon=True) self.video_path = video_path self.fps = fps self.vcodec = vcodec self.pix_fmt = pix_fmt - self.g = g - self.crf = crf - self.preset = preset + self.codec_options = codec_options self.frame_queue = frame_queue self.result_queue = result_queue self.stop_event = stop_event - self.encoder_threads = encoder_threads def run(self) -> None: from .compute_stats import RunningQuantileStats, auto_downsample_height_width @@ -653,19 +570,9 @@ class _CameraEncoderThread(threading.Thread): # Open container on first frame (to get 
width/height) if container is None: height, width = frame_data.shape[:2] - video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset) - if self.encoder_threads is not None: - if self.vcodec == "libsvtav1": - lp_param = f"lp={self.encoder_threads}" - if "svtav1-params" in video_options: - video_options["svtav1-params"] += f":{lp_param}" - else: - video_options["svtav1-params"] = lp_param - else: - video_options["threads"] = str(self.encoder_threads) Path(self.video_path).parent.mkdir(parents=True, exist_ok=True) container = av.open(str(self.video_path), "w") - output_stream = container.add_stream(self.vcodec, self.fps, options=video_options) + output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options) output_stream.pix_fmt = self.pix_fmt output_stream.width = width output_stream.height = height @@ -731,22 +638,24 @@ class StreamingVideoEncoder: def __init__( self, fps: int, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int | None = 2, - crf: int | None = 30, - preset: int | None = None, + camera_encoder: VideoEncoderConfig | None = None, queue_maxsize: int = 30, encoder_threads: int | None = None, ): + """ + Args: + fps: Frames per second for the output videos. + camera_encoder: Video encoder settings applied to all cameras. + When ``None``, :func:`camera_encoder_defaults` is used. + encoder_threads: Number of encoder threads (global setting). + ``None`` lets the codec decide. + queue_maxsize: Max frames to buffer per camera before + back-pressure drops frames. 
+ """ self.fps = fps - self.vcodec = resolve_vcodec(vcodec) - self.pix_fmt = pix_fmt - self.g = g - self.crf = crf - self.preset = preset + self._camera_encoder = camera_encoder or camera_encoder_defaults() + self._encoder_threads = encoder_threads self.queue_maxsize = queue_maxsize - self.encoder_threads = encoder_threads self._frame_queues: dict[str, queue.Queue] = {} self._result_queues: dict[str, queue.Queue] = {} @@ -777,18 +686,17 @@ class StreamingVideoEncoder: temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir)) video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4" + vcodec = self._camera_encoder.vcodec + codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=self.fps, - vcodec=self.vcodec, - pix_fmt=self.pix_fmt, - g=self.g, - crf=self.crf, - preset=self.preset, + vcodec=vcodec, + pix_fmt=self._camera_encoder.pix_fmt, + codec_options=codec_options, frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, - encoder_threads=self.encoder_threads, ) encoder_thread.start() @@ -993,8 +901,18 @@ def get_audio_info(video_path: Path | str) -> dict: return audio_info -def get_video_info(video_path: Path | str) -> dict: - # Set logging level +def get_video_info( + video_path: Path | str, + camera_encoder: VideoEncoderConfig | None = None, +) -> dict: + """Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``. + + Args: + video_path: Path to the encoded video file to probe. + camera_encoder: If provided, record the exact encoder settings used to encode this + video. Stream-derived values take precedence — encoder fields are only written for keys + not already populated from the video file itself. 
+ """ logging.getLogger("libav").setLevel(av.logging.WARNING) # Getting video stream information @@ -1025,6 +943,14 @@ def get_video_info(video_path: Path | str) -> dict: # Adding audio stream information video_info.update(**get_audio_info(video_path)) + # Add additional encoder configuration if provided + if camera_encoder is not None: + for field_name, field_value in asdict(camera_encoder).items(): + # vcodec is already populated from the video stream + if field_name == "vcodec": + continue + video_info.setdefault(f"video.{field_name}", field_value) + return video_info diff --git a/src/lerobot/policies/eo1/modeling_eo1.py b/src/lerobot/policies/eo1/modeling_eo1.py index 27d609ec1..1c5860de5 100644 --- a/src/lerobot/policies/eo1/modeling_eo1.py +++ b/src/lerobot/policies/eo1/modeling_eo1.py @@ -28,11 +28,12 @@ import torch.nn.functional as F # noqa: N812 import torch.utils.checkpoint from torch import Tensor -from lerobot.policies.eo1.configuration_eo1 import EO1Config -from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.utils.constants import ACTION, OBS_STATE from lerobot.utils.import_utils import _transformers_available, require_package +from ..pretrained import PreTrainedPolicy +from .configuration_eo1 import EO1Config + if TYPE_CHECKING or _transformers_available: from transformers.activations import ACT2FN from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration diff --git a/src/lerobot/policies/eo1/processor_eo1.py b/src/lerobot/policies/eo1/processor_eo1.py index 2d7bb48ae..b1f32756a 100644 --- a/src/lerobot/policies/eo1/processor_eo1.py +++ b/src/lerobot/policies/eo1/processor_eo1.py @@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Any import torch from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature -from lerobot.policies.eo1.configuration_eo1 import EO1Config from lerobot.processor import ( AddBatchDimensionProcessorStep, ComplementaryDataProcessorStep, @@ -44,6 +43,8 @@ from 
lerobot.utils.constants import ( ) from lerobot.utils.import_utils import _transformers_available, require_package +from .configuration_eo1 import EO1Config + if TYPE_CHECKING or _transformers_available: from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor else: diff --git a/src/lerobot/rewards/classifier/modeling_classifier.py b/src/lerobot/rewards/classifier/modeling_classifier.py index 1d8057135..ca02b532f 100644 --- a/src/lerobot/rewards/classifier/modeling_classifier.py +++ b/src/lerobot/rewards/classifier/modeling_classifier.py @@ -17,10 +17,11 @@ import logging import torch from torch import Tensor, nn -from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig -from lerobot.rewards.pretrained import PreTrainedRewardModel from lerobot.utils.constants import OBS_IMAGE, REWARD +from ..pretrained import PreTrainedRewardModel +from .configuration_classifier import RewardClassifierConfig + class ClassifierOutput: """Wrapper for classifier outputs with additional metadata.""" diff --git a/src/lerobot/rewards/classifier/processor_classifier.py b/src/lerobot/rewards/classifier/processor_classifier.py index 056d7e91b..a5f609d0c 100644 --- a/src/lerobot/rewards/classifier/processor_classifier.py +++ b/src/lerobot/rewards/classifier/processor_classifier.py @@ -25,7 +25,8 @@ from lerobot.processor import ( policy_action_to_transition, transition_to_policy_action, ) -from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig + +from .configuration_classifier import RewardClassifierConfig def make_classifier_processor( diff --git a/src/lerobot/rewards/factory.py b/src/lerobot/rewards/factory.py index f6716f3fb..c173f44a5 100644 --- a/src/lerobot/rewards/factory.py +++ b/src/lerobot/rewards/factory.py @@ -22,9 +22,10 @@ import torch from lerobot.configs.rewards import RewardModelConfig from lerobot.processor import PolicyAction, PolicyProcessorPipeline -from 
lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig -from lerobot.rewards.pretrained import PreTrainedRewardModel -from lerobot.rewards.sarm.configuration_sarm import SARMConfig + +from .classifier.configuration_classifier import RewardClassifierConfig +from .pretrained import PreTrainedRewardModel +from .sarm.configuration_sarm import SARMConfig def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]: diff --git a/src/lerobot/rewards/sarm/compute_rabc_weights.py b/src/lerobot/rewards/sarm/compute_rabc_weights.py index b1bf2e1f5..bdbb0d297 100644 --- a/src/lerobot/rewards/sarm/compute_rabc_weights.py +++ b/src/lerobot/rewards/sarm/compute_rabc_weights.py @@ -58,9 +58,10 @@ import torch from tqdm import tqdm from lerobot.datasets import LeRobotDataset -from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel -from lerobot.rewards.sarm.processor_sarm import make_sarm_pre_post_processors -from lerobot.rewards.sarm.sarm_utils import normalize_stage_tau + +from .modeling_sarm import SARMRewardModel +from .processor_sarm import make_sarm_pre_post_processors +from .sarm_utils import normalize_stage_tau def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None: diff --git a/src/lerobot/rewards/sarm/modeling_sarm.py b/src/lerobot/rewards/sarm/modeling_sarm.py index 365f519b2..5ebd42d30 100644 --- a/src/lerobot/rewards/sarm/modeling_sarm.py +++ b/src/lerobot/rewards/sarm/modeling_sarm.py @@ -32,13 +32,14 @@ import torch.nn as nn import torch.nn.functional as F # noqa: N812 from torch import Tensor -from lerobot.rewards.pretrained import PreTrainedRewardModel -from lerobot.rewards.sarm.configuration_sarm import SARMConfig -from lerobot.rewards.sarm.sarm_utils import ( +from lerobot.utils.constants import OBS_STR + +from ..pretrained import PreTrainedRewardModel +from .configuration_sarm import SARMConfig +from .sarm_utils import ( normalize_stage_tau, pad_state_to_max_dim, ) -from lerobot.utils.constants import 
OBS_STR class StageTransformer(nn.Module): diff --git a/src/lerobot/rewards/sarm/processor_sarm.py b/src/lerobot/rewards/sarm/processor_sarm.py index eaa5f66f5..37db374d4 100644 --- a/src/lerobot/rewards/sarm/processor_sarm.py +++ b/src/lerobot/rewards/sarm/processor_sarm.py @@ -58,15 +58,16 @@ from lerobot.processor import ( policy_action_to_transition, transition_to_policy_action, ) -from lerobot.rewards.sarm.configuration_sarm import SARMConfig -from lerobot.rewards.sarm.sarm_utils import ( +from lerobot.types import EnvTransition, PolicyAction, TransitionKey +from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME + +from .configuration_sarm import SARMConfig +from .sarm_utils import ( apply_rewind_augmentation, compute_absolute_indices, find_stage_and_tau, pad_state_to_max_dim, ) -from lerobot.types import EnvTransition, PolicyAction, TransitionKey -from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME class SARMEncodingProcessorStep(ProcessorStep): diff --git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py index 8804cd789..bf5fa0fd4 100644 --- a/src/lerobot/rollout/context.py +++ b/src/lerobot/rollout/context.py @@ -332,7 +332,7 @@ def build_rollout_context( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, @@ -367,7 +367,7 @@ def build_rollout_context( image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras if hasattr(robot, "cameras") else []), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, streaming_encoding=cfg.dataset.streaming_encoding, 
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_threads=cfg.dataset.encoder_threads, diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index a708d37a3..eb6a57870 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -187,12 +187,12 @@ import abc import logging import shutil import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path import draccus -from lerobot.configs import parser +from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser from lerobot.datasets import ( LeRobotDataset, convert_image_to_video_dataset, @@ -250,11 +250,7 @@ class ModifyTasksConfig(OperationConfig): @dataclass class ConvertImageToVideoConfig(OperationConfig): output_dir: str | None = None - vcodec: str = "libsvtav1" - pix_fmt: str = "yuv420p" - g: int = 2 - crf: int = 30 - fast_decode: int = 0 + camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) episode_indices: list[int] | None = None num_workers: int = 4 max_episodes_per_batch: int | None = None @@ -557,11 +553,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset=dataset, output_dir=output_dir, repo_id=output_repo_id, - vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"), - pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"), - g=getattr(cfg.operation, "g", 2), - crf=getattr(cfg.operation, "crf", 30), - fast_decode=getattr(cfg.operation, "fast_decode", 0), + camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(), episode_indices=getattr(cfg.operation, "episode_indices", None), num_workers=getattr(cfg.operation, "num_workers", 4), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 129696bd3..c8419cb14 100644 --- 
a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -63,6 +63,27 @@ lerobot-record \\ --dataset.streaming_encoding=true \\ --dataset.encoder_threads=2 ``` + +Example recording with custom video encoding parameters: +```shell +lerobot-record \\ + --robot.type=so100_follower \\ + --robot.port=/dev/tty.usbmodem58760431541 \\ + --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\ + --robot.id=black \\ + --teleop.type=so100_leader \\ + --teleop.port=/dev/tty.usbmodem58760431551 \\ + --teleop.id=blue \\ + --dataset.repo_id=/ \\ + --dataset.num_episodes=2 \\ + --dataset.single_task="Grab the cube" \\ + --dataset.streaming_encoding=true \\ + --dataset.encoder_threads=2 \\ + --dataset.camera_encoder.vcodec=h264 \\ + --dataset.camera_encoder.preset=fast \\ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \\ + --display_data=true +``` """ import logging @@ -377,10 +398,10 @@ def record( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras if num_cameras > 0 @@ -406,10 +427,10 @@ def record( image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder=cfg.dataset.camera_encoder, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, 
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, ) robot.connect() @@ -420,7 +441,7 @@ def record( if not cfg.dataset.streaming_encoding: logging.info( - "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" + "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" ) with VideoEncodingManager(dataset): diff --git a/src/lerobot/scripts/lerobot_rollout.py b/src/lerobot/scripts/lerobot_rollout.py index 6a81563ee..7015e707c 100644 --- a/src/lerobot/scripts/lerobot_rollout.py +++ b/src/lerobot/scripts/lerobot_rollout.py @@ -120,6 +120,18 @@ Usage examples --dataset.repo_id=user/rollout_sentry_data \\ --dataset.single_task="patrol" \\ --resume=true + + # Rollout with custom video encoding parameters + lerobot-rollout \\ + --strategy.type=base \\ + --policy.path=lerobot/act_koch_real \\ + --robot.type=koch_follower \\ + --robot.port=/dev/ttyACM0 \\ + --task="pick up cube" --duration=60 \\ + --display_data=true \\ + --dataset.camera_encoder.vcodec=h264 \\ + --dataset.camera_encoder.preset=fast \\ + --dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} """ import logging diff --git a/src/lerobot/transport/utils.py b/src/lerobot/transport/utils.py index 8da338044..2ef63c2cc 100644 --- a/src/lerobot/transport/utils.py +++ b/src/lerobot/transport/utils.py @@ -25,9 +25,10 @@ from typing import Any import torch -from lerobot.transport import services_pb2 from lerobot.utils.transition import Transition 
+from . import services_pb2 + # FIX for protobuf: Assign the enum to a variable and ignore the type error once TransferState = services_pb2.TransferState # type: ignore[attr-defined] diff --git a/src/lerobot/utils/import_utils.py b/src/lerobot/utils/import_utils.py index 6ba912bf5..ef03367eb 100644 --- a/src/lerobot/utils/import_utils.py +++ b/src/lerobot/utils/import_utils.py @@ -69,7 +69,7 @@ def is_package_available( return package_exists -def get_safe_default_codec(): +def get_safe_default_video_backend(): logger = logging.getLogger(__name__) if importlib.util.find_spec("torchcodec"): return "torchcodec" @@ -128,6 +128,9 @@ _hidapi_available = is_package_available("hidapi", import_name="hid") _pandas_available = is_package_available("pandas") _faker_available = is_package_available("faker") +# Video encoding / decoding +_av_available = is_package_available("av") + # Misc _pynput_available = is_package_available("pynput") _pygame_available = is_package_available("pygame") diff --git a/tests/artifacts/encoded_videos/clip_32x48.mp4 b/tests/artifacts/encoded_videos/clip_32x48.mp4 new file mode 100644 index 000000000..086c399d3 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_32x48.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2191cd86e9e32ecbe18e33ad68d49060e479723ab5a3212bbb26df3025ccb568 +size 5815 diff --git a/tests/artifacts/encoded_videos/clip_4frames.mp4 b/tests/artifacts/encoded_videos/clip_4frames.mp4 new file mode 100644 index 000000000..487c3c8ad --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_4frames.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e0ebf563ba3ed9c24b691a0f0b29e0294a1fa9b51422e1ece296155f1465768 +size 16236 diff --git a/tests/artifacts/encoded_videos/clip_5frames.mp4 b/tests/artifacts/encoded_videos/clip_5frames.mp4 new file mode 100644 index 000000000..cbbe81c39 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_5frames.mp4 @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8475bfd5e6c4c780df46200e2b027e262b38436c57d01078bd943a5b87c65b8f +size 20726 diff --git a/tests/artifacts/encoded_videos/clip_6frames.mp4 b/tests/artifacts/encoded_videos/clip_6frames.mp4 new file mode 100644 index 000000000..50d9badca --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_6frames.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6434322d1c671a7d132367619f841a775317cb9ff973f3f4505831e3ed74076d +size 23808 diff --git a/tests/artifacts/encoded_videos/clip_h264.mp4 b/tests/artifacts/encoded_videos/clip_h264.mp4 new file mode 100644 index 000000000..90698dcf5 --- /dev/null +++ b/tests/artifacts/encoded_videos/clip_h264.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efc84375e92a3499cef93100e04d8fb354670f3d9e0db2097b52575927284fc +size 12237 diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py index 6d646d4f7..80a95aa1f 100644 --- a/tests/datasets/test_aggregate.py +++ b/tests/datasets/test_aggregate.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json +import logging from unittest.mock import patch import pytest @@ -23,7 +25,9 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da import datasets # noqa: E402 import torch +from lerobot.configs import VIDEO_ENCODER_INFO_KEYS from lerobot.datasets.aggregate import aggregate_datasets +from lerobot.datasets.feature_utils import features_equal_for_merge from lerobot.datasets.lerobot_dataset import LeRobotDataset from tests.fixtures.constants import DUMMY_REPO_ID @@ -117,8 +121,9 @@ def assert_metadata_consistency(aggr_ds, ds_0, ds_1): "Robot type should be the same" ) - # Test features are the same - assert aggr_ds.features == ds_0.features == ds_1.features, "Features should be the same" + # Schema matches; merged video ``info`` is reconciled separately from per-source ``info``. + assert features_equal_for_merge(aggr_ds.features, ds_0.features) + assert features_equal_for_merge(aggr_ds.features, ds_1.features) # Test tasks aggregation expected_tasks = set(ds_0.meta.tasks.index) | set(ds_1.meta.tasks.index) @@ -284,6 +289,73 @@ def test_aggregate_datasets(tmp_path, lerobot_dataset_factory): assert_dataset_iteration_works(aggr_ds) +@pytest.mark.parametrize("mutation", ["mismatched_value", "missing_key"]) +def test_aggregate_incomplete_video_encoder_info_warns_and_nuls_encoders( + tmp_path, lerobot_dataset_factory, caplog, mutation +): + """Mismatched or missing encoder ``info`` is merged per-key with fallbacks and a warning.""" + suffix = "enc_mismatch" if mutation == "mismatched_value" else "enc_missing" + ds_0 = lerobot_dataset_factory( + root=tmp_path / f"{suffix}_a", + repo_id=f"{DUMMY_REPO_ID}_{suffix}_a", + total_episodes=2, + total_frames=20, + ) + ds_1 = lerobot_dataset_factory( + root=tmp_path / f"{suffix}_b", + repo_id=f"{DUMMY_REPO_ID}_{suffix}_b", + total_episodes=2, + total_frames=20, + ) + + info_path = ds_1.root / "meta" / "info.json" + data = json.loads(info_path.read_text()) + for ft in 
data["features"].values(): + if ft.get("dtype") != "video": + continue + inf = ft.setdefault("info", {}) + if mutation == "mismatched_value": + inf["video.crf"] = 99 + inf["video.extra_options"] = {"tune": "film"} + else: + inf.pop("video.crf", None) + inf.pop("video.extra_options", None) + info_path.write_text(json.dumps(data)) + + aggr_id = f"{DUMMY_REPO_ID}_{suffix}_aggr" + aggr_root = tmp_path / f"{suffix}_aggr" + with caplog.at_level(logging.WARNING): + aggregate_datasets( + repo_ids=[ds_0.repo_id, ds_1.repo_id], + roots=[ds_0.root, ds_1.root], + aggr_repo_id=aggr_id, + aggr_root=aggr_root, + ) + + assert "heterogeneous" in caplog.text.lower() or "incomplete" in caplog.text.lower() + + with ( + patch("lerobot.datasets.dataset_metadata.get_safe_version") as mock_get_safe_version, + patch("lerobot.datasets.dataset_metadata.snapshot_download") as mock_snapshot_download, + ): + mock_get_safe_version.return_value = "v3.0" + mock_snapshot_download.return_value = str(aggr_root) + aggr_ds = LeRobotDataset(aggr_id, root=aggr_root) + + for key, ft in aggr_ds.meta.info.features.items(): + if ft.get("dtype") != "video": + continue + info = ft["info"] + reference = ds_0.meta.info.features[key]["info"] + for info_key in VIDEO_ENCODER_INFO_KEYS: + if info_key == "video.crf": + assert info[info_key] is None + elif info_key == "video.extra_options": + assert info[info_key] == {} + else: + assert info[info_key] == reference[info_key] + + def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory): """Test aggregation with small file size limits to force file rotation/sharding.""" ds_0_num_episodes = ds_1_num_episodes = 10 diff --git a/tests/datasets/test_dataset_reader.py b/tests/datasets/test_dataset_reader.py index bbe858b5d..085563bb8 100644 --- a/tests/datasets/test_dataset_reader.py +++ b/tests/datasets/test_dataset_reader.py @@ -20,7 +20,7 @@ import pytest pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") from 
lerobot.datasets.dataset_reader import DatasetReader -from lerobot.utils.import_utils import get_safe_default_codec +from lerobot.utils.import_utils import get_safe_default_video_backend # ── Loading ────────────────────────────────────────────────────────── @@ -35,7 +35,7 @@ def test_try_load_returns_true_when_data_exists(tmp_path, lerobot_dataset_factor root=dataset.root, episodes=None, tolerance_s=1e-4, - video_backend=get_safe_default_codec(), + video_backend=get_safe_default_video_backend(), delta_timestamps=None, image_transforms=None, ) @@ -58,7 +58,7 @@ def test_try_load_returns_false_when_no_data(tmp_path): root=meta.root, episodes=None, tolerance_s=1e-4, - video_backend=get_safe_default_codec(), + video_backend=get_safe_default_video_backend(), delta_timestamps=None, image_transforms=None, ) diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py index 0b0862f00..032fd4f7c 100644 --- a/tests/datasets/test_dataset_tools.py +++ b/tests/datasets/test_dataset_tools.py @@ -23,8 +23,10 @@ import torch pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") +from lerobot.configs import VideoEncoderConfig from lerobot.datasets.dataset_tools import ( add_features, + convert_image_to_video_dataset, delete_episodes, merge_datasets, modify_features, @@ -32,7 +34,6 @@ from lerobot.datasets.dataset_tools import ( remove_feature, split_dataset, ) -from lerobot.scripts.lerobot_edit_dataset import convert_image_to_video_dataset @pytest.fixture @@ -1246,10 +1247,12 @@ def test_convert_image_to_video_dataset(tmp_path): dataset=source_dataset, output_dir=output_dir, repo_id="lerobot/pusht_video", - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, + camera_encoder=VideoEncoderConfig( + vcodec="libsvtav1", + pix_fmt="yuv420p", + g=2, + crf=30, + ), episode_indices=[0, 1], num_workers=2, ) diff --git a/tests/datasets/test_dataset_writer.py b/tests/datasets/test_dataset_writer.py index 
8d2bc0373..8670aeebc 100644 --- a/tests/datasets/test_dataset_writer.py +++ b/tests/datasets/test_dataset_writer.py @@ -25,6 +25,7 @@ from PIL import Image pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])") +from lerobot.configs import VideoEncoderConfig from lerobot.datasets.dataset_writer import _encode_video_worker from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.utils import DEFAULT_IMAGE_PATH @@ -52,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict: # ── Existing encode_video_worker tests ─────────────────────────────── -def test_encode_video_worker_forwards_vcodec(tmp_path): - """_encode_video_worker correctly forwards the vcodec parameter.""" +def test_encode_video_worker_forwards_camera_encoder(tmp_path): + """_encode_video_worker forwards camera_encoder to encode_video_frames.""" video_key = "observation.images.laptop" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -68,13 +69,21 @@ def test_encode_video_worker_forwards_vcodec(tmp_path): Path(video_path).touch() with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): - _encode_video_worker(video_key, 0, tmp_path, fps=30, vcodec="h264") + _encode_video_worker( + video_key, + 0, + tmp_path, + fps=30, + camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None), + encoder_threads=4, + ) - assert captured_kwargs["vcodec"] == "h264" + assert captured_kwargs["camera_encoder"].vcodec == "h264" + assert captured_kwargs["encoder_threads"] == 4 -def test_encode_video_worker_default_vcodec(tmp_path): - """_encode_video_worker uses libsvtav1 as the default codec.""" +def test_encode_video_worker_default_camera_encoder(tmp_path): + """_encode_video_worker passes None camera_encoder which encode_video_frames defaults.""" video_key = "observation.images.laptop" fpath = 
DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0) img_dir = tmp_path / Path(fpath).parent @@ -91,7 +100,8 @@ def test_encode_video_worker_default_vcodec(tmp_path): with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode): _encode_video_worker(video_key, 0, tmp_path, fps=30) - assert captured_kwargs["vcodec"] == "libsvtav1" + assert captured_kwargs["camera_encoder"] is None + assert captured_kwargs["encoder_threads"] is None # ── add_frame contracts ────────────────────────────────────────────── diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 654f8cdf1..ba9b64812 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -29,6 +29,7 @@ from PIL import Image from safetensors.torch import load_file from torchvision.transforms import v2 +from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig from lerobot.configs.default import DatasetConfig from lerobot.configs.train import TrainPipelineConfig from lerobot.datasets import make_dataset @@ -43,7 +44,6 @@ from lerobot.datasets.utils import ( DEFAULT_VIDEO_FILE_SIZE_IN_MB, create_branch, ) -from lerobot.datasets.video_utils import VALID_VIDEO_CODECS from lerobot.envs.factory import make_env_config from lerobot.policies.factory import make_policy_config from lerobot.robots import make_robot_from_config @@ -1470,17 +1470,9 @@ def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_fact def test_lerobot_dataset_vcodec_validation(): - """Test that LeRobotDataset validates the vcodec parameter.""" - # Test that invalid vcodec raises ValueError + """Invalid vcodec in encoder config is rejected at construction time.""" with pytest.raises(ValueError, match="Invalid vcodec"): - LeRobotDataset.__new__(LeRobotDataset) # bypass __init__ to test validation directly - # Actually test via create since it's easier - LeRobotDataset.create( - repo_id="test/invalid_codec", - fps=30, - 
features={"observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}}, - vcodec="invalid_codec", - ) + VideoEncoderConfig(vcodec="invalid_codec") def test_valid_video_codecs_constant(): diff --git a/tests/datasets/test_streaming_video_encoder.py b/tests/datasets/test_streaming_video_encoder.py index 8b7a1540f..b69f24254 100644 --- a/tests/datasets/test_streaming_video_encoder.py +++ b/tests/datasets/test_streaming_video_encoder.py @@ -14,11 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for streaming video encoding and hardware-accelerated encoding.""" +"""Tests for streaming video encoding.""" import queue import threading -from unittest.mock import patch import numpy as np import pytest @@ -27,112 +26,20 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])") import av # noqa: E402 +from lerobot.configs import VideoEncoderConfig +from lerobot.datasets.pyav_utils import get_codec from lerobot.datasets.video_utils import ( - VALID_VIDEO_CODECS, StreamingVideoEncoder, _CameraEncoderThread, - _get_codec_options, - detect_available_hw_encoders, - resolve_vcodec, ) from lerobot.utils.constants import OBS_IMAGES -# ─── _get_codec_options tests ─── - - -class TestGetCodecOptions: - def test_libsvtav1_defaults(self): - opts = _get_codec_options("libsvtav1") - assert opts["g"] == "2" - assert opts["crf"] == "30" - assert opts["preset"] == "12" - - def test_libsvtav1_custom_preset(self): - opts = _get_codec_options("libsvtav1", preset=8) - assert opts["preset"] == "8" - - def test_h264_options(self): - opts = _get_codec_options("h264", g=10, crf=23) - assert opts["g"] == "10" - assert opts["crf"] == "23" - assert "preset" not in opts - - def test_videotoolbox_options(self): - opts = _get_codec_options("h264_videotoolbox", g=2, crf=30) - assert opts["g"] == "2" - # CRF 30 maps to quality = max(1, min(100, 100 - 30*2)) = 40 - assert opts["q:v"] == "40" - 
assert "crf" not in opts - - def test_nvenc_options(self): - opts = _get_codec_options("h264_nvenc", g=2, crf=25) - assert opts["rc"] == "constqp" - assert opts["qp"] == "25" - assert "crf" not in opts - # NVENC doesn't support g - assert "g" not in opts - - def test_vaapi_options(self): - opts = _get_codec_options("h264_vaapi", crf=28) - assert opts["qp"] == "28" - - def test_qsv_options(self): - opts = _get_codec_options("h264_qsv", crf=25) - assert opts["global_quality"] == "25" - - def test_no_g_no_crf(self): - opts = _get_codec_options("h264", g=None, crf=None) - assert "g" not in opts - assert "crf" not in opts - - -# ─── HW encoder detection tests ─── - - -class TestHWEncoderDetection: - def test_detect_available_hw_encoders_returns_list(self): - result = detect_available_hw_encoders() - assert isinstance(result, list) - - def test_detect_available_hw_encoders_only_valid(self): - from lerobot.datasets.video_utils import HW_ENCODERS - - result = detect_available_hw_encoders() - for encoder in result: - assert encoder in HW_ENCODERS - - def test_resolve_vcodec_passthrough(self): - assert resolve_vcodec("libsvtav1") == "libsvtav1" - assert resolve_vcodec("h264") == "h264" - - def test_resolve_vcodec_auto_fallback(self): - """When no HW encoders are available, auto should fall back to libsvtav1.""" - with patch("lerobot.datasets.video_utils.detect_available_hw_encoders", return_value=[]): - assert resolve_vcodec("auto") == "libsvtav1" - - def test_resolve_vcodec_auto_picks_hw(self): - """When a HW encoder is available, auto should pick it.""" - with patch( - "lerobot.datasets.video_utils.detect_available_hw_encoders", - return_value=["h264_videotoolbox"], - ): - assert resolve_vcodec("auto") == "h264_videotoolbox" - - def test_resolve_vcodec_auto_returns_valid(self): - """Test that resolve_vcodec('auto') returns a known valid codec.""" - result = resolve_vcodec("auto") - assert result in VALID_VIDEO_CODECS - - def 
test_hw_encoder_names_accepted_in_validation(self): - """Test that HW encoder names pass validation in VALID_VIDEO_CODECS.""" - assert "auto" in VALID_VIDEO_CODECS - assert "h264_videotoolbox" in VALID_VIDEO_CODECS - assert "h264_nvenc" in VALID_VIDEO_CODECS - - def test_resolve_vcodec_invalid_raises(self): - """Test that resolve_vcodec raises ValueError for invalid codecs.""" - with pytest.raises(ValueError, match="Invalid vcodec"): - resolve_vcodec("not_a_real_codec") +# Cross-codec validation tests only fire when the target codec is present +# in the local FFmpeg build; on other platforms validate() is a no-op. +_has_videotoolbox = get_codec("h264_videotoolbox") is not None +_videotoolbox_only = pytest.mark.skipif( + not _has_videotoolbox, reason="h264_videotoolbox not in local FFmpeg build" +) # ─── _CameraEncoderThread tests ─── @@ -150,14 +57,13 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -202,14 +108,13 @@ class TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -237,14 +142,13 @@ class 
TestCameraEncoderThread: result_queue: queue.Queue = queue.Queue(maxsize=1) stop_event = threading.Event() + enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) encoder_thread = _CameraEncoderThread( video_path=video_path, fps=fps, - vcodec="libsvtav1", - pix_fmt="yuv420p", - g=2, - crf=30, - preset=13, + vcodec=enc_cfg.vcodec, + pix_fmt=enc_cfg.pix_fmt, + codec_options=enc_cfg.get_codec_options(as_strings=True), frame_queue=frame_queue, result_queue=result_queue, stop_event=stop_event, @@ -266,11 +170,20 @@ class TestCameraEncoderThread: class TestStreamingVideoEncoder: + def _make_encoder_config(self, **kwargs): + """Helper to build a VideoEncoderConfig.""" + return VideoEncoderConfig(**kwargs) + def test_single_camera_episode(self, tmp_path): """Test encoding a single camera episode.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) - video_keys = [f"{OBS_IMAGES}.laptop"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + ) + encoder.start_episode(video_keys, tmp_path) num_frames = 20 @@ -295,9 +208,11 @@ class TestStreamingVideoEncoder: def test_multi_camera_episode(self, tmp_path): """Test encoding multiple cameras simultaneously.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) - video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 15 @@ -319,8 +234,11 @@ class TestStreamingVideoEncoder: def test_sequential_episodes(self, tmp_path): """Test that multiple sequential episodes work correctly.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) video_keys = 
[f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) for ep in range(3): encoder.start_episode(video_keys, tmp_path) @@ -342,8 +260,11 @@ class TestStreamingVideoEncoder: def test_cancel_episode(self, tmp_path): """Test that canceling an episode cleans up properly.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) video_keys = [f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) @@ -365,28 +286,33 @@ class TestStreamingVideoEncoder: def test_feed_without_start_raises(self, tmp_path): """Test that feeding frames without starting an episode raises.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) with pytest.raises(RuntimeError, match="No active episode"): encoder.feed_frame("cam", np.zeros((64, 96, 3), dtype=np.uint8)) encoder.close() def test_finish_without_start_raises(self, tmp_path): """Test that finishing without starting raises.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) with pytest.raises(RuntimeError, match="No active episode"): encoder.finish_episode() encoder.close() def test_close_is_idempotent(self, tmp_path): """Test that close() can be called multiple times safely.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") + encoder = StreamingVideoEncoder(fps=30) encoder.close() encoder.close() # Should not raise def test_video_duration_matches_frame_count(self, tmp_path): """Test that encoded video duration matches num_frames / fps.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13) video_keys = 
[f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 90 # 3 seconds at 30fps @@ -417,9 +343,11 @@ class TestStreamingVideoEncoder: def test_multi_camera_start_episode_called_once(self, tmp_path): """Test that with multiple cameras, no frames are lost due to double start_episode.""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30) - video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), + ) encoder.start_episode(video_keys, tmp_path) num_frames = 30 @@ -446,17 +374,24 @@ class TestStreamingVideoEncoder: def test_encoder_threads_passed_to_thread(self, tmp_path): """Test that encoder_threads is stored and passed through to encoder threads.""" - encoder = StreamingVideoEncoder( - fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, encoder_threads=2 - ) - assert encoder.encoder_threads == 2 - video_keys = [f"{OBS_IMAGES}.cam"] + cfg = VideoEncoderConfig( + vcodec="libsvtav1", + pix_fmt="yuv420p", + g=2, + crf=30, + ) + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=cfg, + encoder_threads=2, + ) + assert encoder._encoder_threads == 2 encoder.start_episode(video_keys, tmp_path) - # Verify the thread received the encoder_threads value + # Verify codec options include thread tuning for libsvtav1 (lp=…) thread = encoder._threads[f"{OBS_IMAGES}.cam"] - assert thread.encoder_threads == 2 + assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options # Feed some frames and finish to ensure it works end-to-end num_frames = 10 @@ -478,16 +413,20 @@ class TestStreamingVideoEncoder: def test_encoder_threads_none_by_default(self, tmp_path): """Test that encoder_threads 
defaults to None (codec auto-detect).""" - encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p") - assert encoder.encoder_threads is None + encoder = StreamingVideoEncoder(fps=30) + assert encoder._encoder_threads is None encoder.close() def test_graceful_frame_dropping(self, tmp_path): """Test that full queue drops frames instead of crashing.""" - encoder = StreamingVideoEncoder( - fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13, queue_maxsize=1 - ) video_keys = [f"{OBS_IMAGES}.cam"] + encoder = StreamingVideoEncoder( + fps=30, + camera_encoder=self._make_encoder_config( + vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13 + ), + queue_maxsize=1, + ) encoder.start_episode(video_keys, tmp_path) # Feed many frames quickly - with queue_maxsize=1, some will be dropped diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py new file mode 100644 index 000000000..224f2405b --- /dev/null +++ b/tests/datasets/test_video_encoding.py @@ -0,0 +1,595 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for ``lerobot.datasets.video_utils`` encoding functions and ``lerobot.configs.video.VideoEncoderConfig`` config class.""" + +import json +from pathlib import Path + +import numpy as np +import pytest + +pytest.importorskip("av", reason="av is required (install lerobot[dataset])") + +import av # noqa: E402 + +from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig +from lerobot.datasets.image_writer import write_image +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.pyav_utils import get_codec +from lerobot.datasets.utils import INFO_PATH +from lerobot.datasets.video_utils import ( + concatenate_video_files, + encode_video_frames, + get_video_info, +) +from tests.fixtures.constants import DUMMY_VIDEO_INFO + + +# Per-codec skip markers — validation tests only fire when the codec is available +def _require_encoder(vcodec: str) -> pytest.MarkDecorator: + """Skip the test if ``vcodec`` is not available in the local FFmpeg build.""" + return pytest.mark.skipif(get_codec(vcodec) is None, reason=f"{vcodec!r} not in local FFmpeg build") + + +require_libsvtav1 = _require_encoder("libsvtav1") +require_h264 = _require_encoder("h264") +require_videotoolbox = _require_encoder("h264_videotoolbox") +require_nvenc = _require_encoder("h264_nvenc") +require_vaapi = _require_encoder("h264_vaapi") +require_qsv = _require_encoder("h264_qsv") + + +# ─── VideoEncoderConfig / codec options ────────────────────────────── + + +class TestCodecOptions: + @require_libsvtav1 + def test_libsvtav1_defaults(self): + cfg = VideoEncoderConfig() + opts = cfg.get_codec_options() + assert opts["g"] == 2 + assert opts["crf"] == 30 + assert opts["preset"] == 12 + + @require_libsvtav1 + def test_libsvtav1_custom_preset(self): + cfg = VideoEncoderConfig(preset=8) + assert cfg.get_codec_options()["preset"] == 8 + + @require_h264 + def test_h264_options(self): + cfg = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None) + opts = 
cfg.get_codec_options() + assert opts["g"] == 10 + assert opts["crf"] == 23 + assert "preset" not in opts + + @require_videotoolbox + def test_videotoolbox_options(self): + cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None) + opts = cfg.get_codec_options() + assert opts["g"] == 2 + assert opts["q:v"] == 40 + assert "crf" not in opts + + @_require_encoder("h264_nvenc") + def test_nvenc_options(self): + cfg = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None) + opts = cfg.get_codec_options() + assert opts["rc"] == 0 + assert opts["qp"] == 25 + assert "crf" not in opts + assert opts["g"] == 2 + + @_require_encoder("h264_vaapi") + def test_vaapi_options(self): + cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None) + assert cfg.get_codec_options()["qp"] == 28 + + @_require_encoder("h264_qsv") + def test_qsv_options(self): + cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None) + assert cfg.get_codec_options()["global_quality"] == 25 + + @require_h264 + def test_no_g_no_crf(self): + cfg = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None) + opts = cfg.get_codec_options() + assert "g" not in opts + assert "crf" not in opts + + @require_libsvtav1 + def test_encoder_threads_libsvtav1(self): + cfg = VideoEncoderConfig(fast_decode=0) + opts = cfg.get_codec_options(encoder_threads=4) + assert "lp=4" in opts.get("svtav1-params", "") + + @require_h264 + def test_encoder_threads_h264(self): + cfg = VideoEncoderConfig(vcodec="h264", preset=None) + assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2 + + @require_libsvtav1 + def test_fast_decode_libsvtav1(self): + cfg = VideoEncoderConfig(fast_decode=1) + opts = cfg.get_codec_options() + assert "fast-decode=1" in opts.get("svtav1-params", "") + + @require_libsvtav1 + def test_libsvtav1_fast_decode_clamped_to_svt_range(self): + """Out-of-range fast_decode is clamped to [0, 2] in svtav1-params (SVT-AV1 FastDecode).""" + cfg = 
VideoEncoderConfig(fast_decode=100) + assert "fast-decode=2" in cfg.get_codec_options().get("svtav1-params", "") + cfg_neg = VideoEncoderConfig(fast_decode=-5) + assert "fast-decode=0" in cfg_neg.get_codec_options().get("svtav1-params", "") + + @require_h264 + def test_fast_decode_h264(self): + cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None) + assert cfg.get_codec_options()["tune"] == "fastdecode" + + @require_libsvtav1 + def test_pix_fmt_unsupported_raises(self): + """Passing an unsupported pix_fmt is a hard error.""" + with pytest.raises(ValueError, match="pix_fmt"): + VideoEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants + + @require_libsvtav1 + @require_h264 + def test_preset_default_behaviour(self): + """Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None.""" + assert VideoEncoderConfig().preset == 12 + assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12 + assert VideoEncoderConfig(vcodec="h264").preset is None + assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None + + @require_h264 + def test_preset_string_on_h264(self): + """h264 accepts string presets and forwards them to FFmpeg.""" + cfg = VideoEncoderConfig(vcodec="h264", preset="slow") + assert cfg.get_codec_options()["preset"] == "slow" + + @require_videotoolbox + def test_preset_on_videotoolbox_not_set(self): + """videotoolbox has no preset option at all.""" + cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow") + assert "preset" not in cfg.get_codec_options() + + @require_libsvtav1 + def test_libsvtav1_preset_out_of_range_raises(self): + """libsvtav1 preset must sit in [-2, 13] as exposed by PyAV.""" + with pytest.raises(ValueError, match="out of range"): + VideoEncoderConfig(vcodec="libsvtav1", preset=100) + with pytest.raises(ValueError, match="out of range"): + VideoEncoderConfig(vcodec="libsvtav1", preset=-3) + + @require_libsvtav1 + def test_libsvtav1_crf_out_of_range_raises(self): + 
"""libsvtav1 crf must sit in [0, 63].""" + with pytest.raises(ValueError, match="crf.*out of range"): + VideoEncoderConfig(vcodec="libsvtav1", crf=64) + + @require_libsvtav1 + def test_libsvtav1_crf_rejects_python_float(self): + """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig(vcodec="libsvtav1", crf=2.5) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_fractional_string(self): + """INT options reject fractional values even when supplied only via ``extra_options``.""" + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": "2.5"}, + ) + + @require_libsvtav1 + def test_libsvtav1_extra_crf_rejects_float(self): + with pytest.raises(ValueError, match="float values are not allowed"): + VideoEncoderConfig( + vcodec="libsvtav1", + crf=None, + extra_options={"crf": 2.5}, + ) + + @require_h264 + def test_h264_crf_accepts_float_and_int(self): + """x264 exposes crf as a FLOAT option, so both int and float are accepted.""" + assert VideoEncoderConfig(vcodec="h264", crf=23).get_codec_options()["crf"] == 23 + assert VideoEncoderConfig(vcodec="h264", crf=23.5).get_codec_options()["crf"] == 23.5 + + @require_libsvtav1 + def test_validate_is_rerunnable(self): + """After mutating a field, validate() re-checks and surfaces new issues.""" + cfg = VideoEncoderConfig(vcodec="libsvtav1") + cfg.preset = 100 # now out of range + with pytest.raises(ValueError, match="out of range"): + cfg.validate() + + +class TestExtraOptions: + @require_libsvtav1 + def test_default_is_empty_dict(self): + cfg = VideoEncoderConfig() + assert cfg.extra_options == {} + + @require_libsvtav1 + def test_unknown_key_passes_through(self): + """Keys not published as AVOptions are forwarded to FFmpeg.""" + cfg = VideoEncoderConfig(extra_options={"totally_made_up_option": 
"value"}) + assert cfg.extra_options == {"totally_made_up_option": "value"} + + @require_libsvtav1 + def test_numeric_value_in_range_ok(self): + """libsvtav1 exposes ``qp`` as INT in [0, 63].""" + cfg = VideoEncoderConfig(extra_options={"qp": 30}) + assert cfg.extra_options == {"qp": 30} + + @require_libsvtav1 + def test_numeric_out_of_range_raises(self): + with pytest.raises(ValueError, match=r"qp=.*out of range"): + VideoEncoderConfig(extra_options={"qp": 999}) + + @require_libsvtav1 + def test_numeric_string_accepted_in_range(self): + """Numeric strings are accepted for numeric options (mirrors FFmpeg).""" + cfg = VideoEncoderConfig(extra_options={"qp": "18"}) + assert cfg.extra_options == {"qp": "18"} + + @require_libsvtav1 + def test_numeric_string_out_of_range_raises(self): + with pytest.raises(ValueError, match=r"qp=.*out of range"): + VideoEncoderConfig(extra_options={"qp": "999"}) + + @require_libsvtav1 + def test_non_numeric_string_on_numeric_option_raises(self): + with pytest.raises(ValueError, match=r"qp=.*not numeric"): + VideoEncoderConfig(extra_options={"qp": "medium"}) + + @require_libsvtav1 + def test_bool_on_numeric_option_raises(self): + """``bool`` is explicitly rejected for numeric options.""" + with pytest.raises(ValueError, match=r"qp=.*not numeric"): + VideoEncoderConfig(extra_options={"qp": True}) + + @require_h264 + def test_string_option_passes_through_unchecked(self): + """String-typed AVOptions are NOT enum-checked (too many accept freeform).""" + cfg = VideoEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"}) + assert cfg.extra_options == {"tune": "some-future-tune"} + + @require_libsvtav1 + def test_merged_into_codec_options_and_stringified(self): + """Typed merge by default; ``as_strings=True`` matches FFmpeg option dict.""" + cfg = VideoEncoderConfig(extra_options={"qp": 20}) + opts = cfg.get_codec_options() + assert opts["qp"] == 20 + assert isinstance(opts["qp"], int) + assert 
cfg.get_codec_options(as_strings=True)["qp"] == "20" + + @require_libsvtav1 + def test_structured_fields_win_on_collision(self): + """A colliding extra_options key is discarded; the structured field wins.""" + cfg = VideoEncoderConfig(crf=30, extra_options={"crf": 18}) + assert cfg.get_codec_options()["crf"] == 30 + + +class TestEncoderDetection: + @require_h264 + def test_explicit_codec_kept_when_available(self): + cfg = VideoEncoderConfig(vcodec="h264") + assert cfg.vcodec == "h264" + + @require_videotoolbox + def test_auto_picks_videotoolbox_when_available(self): + """``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present.""" + cfg = VideoEncoderConfig(vcodec="auto") + assert cfg.vcodec == "h264_videotoolbox" + + def test_invalid_codec_raises(self): + with pytest.raises(ValueError, match="Invalid vcodec"): + VideoEncoderConfig(vcodec="not_a_real_codec") + + def test_hw_encoder_names_listed_as_valid(self): + assert "auto" in VALID_VIDEO_CODECS + assert "h264_videotoolbox" in VALID_VIDEO_CODECS + assert "h264_nvenc" in VALID_VIDEO_CODECS + + +TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos" + +# Default video feature set used by persistence tests. 
+VIDEO_FEATURES = { + "observation.images.cam": { + "dtype": "video", + "shape": (64, 96, 3), + "names": ["height", "width", "channels"], + }, + "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}, +} +VIDEO_KEY = "observation.images.cam" + + +def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None: + imgs_dir.mkdir(parents=True, exist_ok=True) + for i in range(num_frames): + arr = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + write_image(arr, imgs_dir / f"frame-{i:06d}.png") + + +def _encode_video( + path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None +) -> Path: + imgs_dir = path.parent / f"imgs_{path.stem}" + _write_frames(imgs_dir, num_frames=num_frames) + encode_video_frames(imgs_dir, path, fps=fps, camera_encoder=cfg, overwrite=True) + return path + + +def _read_feature_info(dataset: LeRobotDataset) -> dict: + info = json.loads((dataset.root / INFO_PATH).read_text()) + return info["features"][VIDEO_KEY]["info"] + + +def _add_frames(dataset: LeRobotDataset, num_frames: int) -> None: + shape = dataset.meta.features[VIDEO_KEY]["shape"] + for _ in range(num_frames): + dataset.add_frame( + { + VIDEO_KEY: np.random.randint(0, 256, shape, dtype=np.uint8), + "action": np.zeros(2, dtype=np.float32), + "task": "test", + } + ) + + +class TestGetVideoInfo: + def test_returns_all_stream_fields(self): + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4") + + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.pix_fmt"] == "yuv420p" + assert info["video.fps"] == 30 + assert info["video.channels"] == 3 + assert info["video.is_depth_map"] is False + assert info["has_audio"] is False + assert "video.g" not in info + assert "video.crf" not in info + assert "video.preset" not in info + + @require_libsvtav1 + def test_merges_encoder_config_as_video_prefixed_entries(self): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, 
crf=30, preset=12) + + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + + assert info["video.g"] == 2 + assert info["video.crf"] == 30 + assert info["video.preset"] == 12 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + @require_libsvtav1 + def test_stream_derived_keys_take_precedence_over_config(self): + cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p") + + info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg) + + assert info["video.codec"] # populated from stream, not from config's vcodec + assert info["video.pix_fmt"] == "yuv420p" + + +class TestEncodeVideoFrames: + @require_libsvtav1 + def test_produces_readable_mp4(self, tmp_path): + video_path = _encode_video(tmp_path / "out.mp4") + + assert video_path.exists() + info = get_video_info(video_path) + assert info["video.height"] == 64 + assert info["video.width"] == 96 + + @require_libsvtav1 + def test_frame_count_and_duration_match_input(self, tmp_path): + num_frames = 10 + fps = 30 + video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps) + + with av.open(str(video_path)) as container: + stream = container.streams.video[0] + actual_frames = sum(1 for _ in container.decode(stream)) + duration = ( + float(stream.duration * stream.time_base) + if stream.duration is not None + else float(container.duration / av.time_base) + ) + + assert actual_frames == num_frames + assert abs(duration - num_frames / fps) < 0.1 + + def test_overwrite_false_skips_existing_file(self, tmp_path): + imgs_dir = tmp_path / "imgs" + _write_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + sentinel = b"pre-existing content" + video_path.write_bytes(sentinel) + + encode_video_frames(imgs_dir, video_path, fps=30, overwrite=False) + + assert video_path.read_bytes() == sentinel + + @require_libsvtav1 + def 
test_overwrite_true_replaces_existing_file(self, tmp_path): + imgs_dir = tmp_path / "imgs" + _write_frames(imgs_dir) + video_path = tmp_path / "out.mp4" + video_path.write_bytes(b"stale content") + + encode_video_frames(imgs_dir, video_path, fps=30, overwrite=True) + + info = get_video_info(video_path) + assert info["video.height"] == 64 + + @require_libsvtav1 + def test_custom_encoder_config_fields_stored_in_info(self, tmp_path): + """All stream-derived and encoder config fields are present after encoding.""" + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10) + video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg) + + info = get_video_info(video_path, camera_encoder=cfg) + + # Stream-derived + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.channels"] == 3 + assert info["video.codec"] == "av1" + assert info["video.pix_fmt"] == "yuv420p" + assert info["video.fps"] == 30 + assert info["video.is_depth_map"] is False + assert info["has_audio"] is False + # Encoder config + assert info["video.g"] == 4 + assert info["video.crf"] == 25 + assert info["video.preset"] == 10 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + +class TestConcatenateVideoFiles: + def test_two_clips_frame_count(self, tmp_path): + """Output frame count equals the sum of the two input frame counts.""" + out = tmp_path / "out.mp4" + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_6frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out + ) + + with av.open(str(out)) as container: + total = sum(1 for _ in container.decode(video=0)) + assert total == 10 + + def test_three_clips_frame_count(self, tmp_path): + out = tmp_path / "out.mp4" + clip = TEST_ARTIFACTS_DIR / "clip_5frames.mp4" + concatenate_video_files([clip, clip, clip], out) + + with av.open(str(out)) as container: + total = sum(1 for _ in 
container.decode(video=0)) + assert total == 15 + + @require_libsvtav1 + def test_geometry_preserved(self, tmp_path): + """Output resolution, fps, codec and pixel format must match the inputs.""" + out = tmp_path / "out.mp4" + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out + ) + + info = get_video_info(out) + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.fps"] == 30 + assert info["video.codec"] == "av1" + assert info["video.pix_fmt"] == "yuv420p" + + def test_compatibility_check_raises_on_different_codec(self, tmp_path): + with pytest.raises(ValueError): + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_h264.mp4"], + tmp_path / "out.mp4", + compatibility_check=True, + ) + + def test_compatibility_check_raises_on_different_resolution(self, tmp_path): + with pytest.raises(ValueError): + concatenate_video_files( + [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_32x48.mp4"], + tmp_path / "out.mp4", + compatibility_check=True, + ) + + +class TestEncoderConfigPersistence: + """Encoder config must be stored as ``video.`` entries in + ``info["features"][key]["info"]`` when the first episode is saved. 
+ """ + + @require_libsvtav1 + def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + ) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + info = _read_feature_info(dataset) + + assert info["video.height"] == 64 + assert info["video.width"] == 96 + assert info["video.fps"] == 30 + assert info["video.g"] == 2 + assert info["video.crf"] == 30 + assert info["video.preset"] == 12 + assert info["video.fast_decode"] == 0 + assert info["video.video_backend"] == "pyav" + assert info["video.extra_options"] == {} + + @require_libsvtav1 + def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory): + cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12) + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg + ) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + first_info = dict(_read_feature_info(dataset)) + + _add_frames(dataset, num_frames=4) + dataset.save_episode() + dataset.finalize() + + assert _read_feature_info(dataset) == first_info + + +class TestFromVideoInfo: + """``VideoEncoderConfig.from_video_info`` reconstructs an encoder config + from the ``video.*`` keys persisted in a dataset's ``info.json``. + """ + + @require_libsvtav1 + def test_reconstructs_from_dummy_video_info(self): + cfg = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO) + + # Canonical stream codec ``"av1"`` is aliased to the encoder name. 
+ assert cfg.vcodec == "libsvtav1" + assert cfg.pix_fmt == DUMMY_VIDEO_INFO["video.pix_fmt"] + assert cfg.g == DUMMY_VIDEO_INFO["video.g"] + assert cfg.crf == DUMMY_VIDEO_INFO["video.crf"] + assert cfg.preset == DUMMY_VIDEO_INFO["video.preset"] + assert cfg.fast_decode == DUMMY_VIDEO_INFO["video.fast_decode"] + assert cfg.video_backend == DUMMY_VIDEO_INFO["video.video_backend"] + # ``{}`` placeholder (typical after a merge with disagreeing sources) + # must not leak into the reconstructed config. + assert cfg.extra_options == VideoEncoderConfig().extra_options diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py index 35d8776ce..4d578b503 100644 --- a/tests/fixtures/constants.py +++ b/tests/fixtures/constants.py @@ -28,17 +28,23 @@ DUMMY_MOTOR_FEATURES = { "names": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"], }, } -DUMMY_CAMERA_FEATURES = { - "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None}, - "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None}, -} DEFAULT_FPS = 30 DUMMY_VIDEO_INFO = { "video.fps": DEFAULT_FPS, "video.codec": "av1", "video.pix_fmt": "yuv420p", + "video.video_backend": "pyav", + "video.extra_options": {}, + "video.g": 2, + "video.crf": 30, + "video.preset": 12, + "video.fast_decode": 0, "video.is_depth_map": False, "has_audio": False, } +DUMMY_CAMERA_FEATURES = { + "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, + "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": DUMMY_VIDEO_INFO}, +} DUMMY_CHW = (3, 96, 128) DUMMY_HWC = (96, 128, 3) diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py index 48128a8d0..a6e349778 100644 --- a/tests/fixtures/dataset_factories.py +++ b/tests/fixtures/dataset_factories.py @@ -46,7 +46,6 @@ from tests.fixtures.constants import ( DUMMY_MOTOR_FEATURES, DUMMY_REPO_ID, 
DUMMY_ROBOT_TYPE, - DUMMY_VIDEO_INFO, ) @@ -134,9 +133,7 @@ def features_factory(): use_videos: bool = True, ) -> dict: if use_videos: - camera_ft = { - key: {"dtype": "video", **ft, **DUMMY_VIDEO_INFO} for key, ft in camera_features.items() - } + camera_ft = {key: {"dtype": "video", **ft} for key, ft in camera_features.items()} else: camera_ft = {key: {"dtype": "image", **ft} for key, ft in camera_features.items()} return {