feat(dataset): add streaming video encoding + HW encoder support (#2974)

* feat(dataset): init stream encoding * feat(dataset): use threads to fix frame pickle latency * refactor(dataset): remove HW encoded related changes * add lp (#2977) * feat(dataset): add Hw encoding + log drop frames (#2978) * chore(docs): add streaming video encoding guide * fix(dataset): style docs + testing * chore(docs): simplify sttreaming video encoding guide * chore(dataset): add commands + streaming encoding default false + print note if false + queue default is now 30 * chore(docs): add verification note advice * chore(dataset): adjusting defaults & docs for streaming encoding * docs(scripts): improve docstrings * test(dataset): polish streaming encoding tests * chore(dataset): move FYI log related to streaming * chore(dataset): add arg vcodec to suggestions * refactor(dataset): better handling for auto and available vcodec * chore(dataset): change log level * docs(dataset): add note related to training performance vcodec * docs(dataset): add more notes to streaming encoding --------- Co-authored-by: Caroline Pascal <caroline8.pascal@gmail.com> Co-authored-by: Pepijn <pepijn@huggingface.co>
2026-05-22 03:59:42 +00:00 · 2026-02-23 13:57:43 +01:00
parent 5865170d36
commit e96339a3b4
16 changed files with 1532 additions and 55 deletions
@@ -26,8 +26,10 @@ lerobot-record \
    --dataset.repo_id=<my_username>/<my_dataset_name> \
    --dataset.num_episodes=2 \
    --dataset.single_task="Grab the cube" \
+    --dataset.streaming_encoding=true \
+    --dataset.encoder_threads=2 \
    --display_data=true
-    # <- Optional: specify video codec (h264, hevc, libsvtav1). Default is libsvtav1. \
+    # <- Optional: specify video codec (auto, h264, hevc, libsvtav1). Default is libsvtav1. \
    # --dataset.vcodec=h264 \
    # <- Teleop optional if you want to teleoperate to record or in between episodes with a policy \
    # --teleop.type=so100_leader \
@@ -58,7 +60,10 @@ lerobot-record \
  --display_data=true \
  --dataset.repo_id=${HF_USER}/bimanual-so-handover-cube \
  --dataset.num_episodes=25 \
-  --dataset.single_task="Grab and handover the red cube to the other arm"
+  --dataset.single_task="Grab and handover the red cube to the other arm" \
+  --dataset.streaming_encoding=true \
+  --dataset.encoder_threads=2
+  # Optional: --dataset.vcodec=auto
 ```
 """

@@ -179,9 +184,19 @@ class DatasetRecordConfig:
    # Number of episodes to record before batch encoding videos
    # Set to 1 for immediate encoding (default behavior), or higher for batched encoding
    video_encoding_batch_size: int = 1
-    # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1'.
-    # Use 'h264' for faster encoding on systems where AV1 encoding is CPU-heavy.
+    # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto',
+    # or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'.
+    # Use 'auto' to auto-detect the best available hardware encoder.
    vcodec: str = "libsvtav1"
+    # Enable streaming video encoding: encode frames in real-time during capture instead
+    # of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
+    streaming_encoding: bool = False
+    # Maximum number of frames to buffer per camera when using streaming encoding.
+    # ~1s buffer at 30fps. Provides backpressure if the encoder can't keep up.
+    encoder_queue_maxsize: int = 30
+    # Number of threads per encoder instance. None = auto (codec default).
+    # Lower values reduce CPU usage. Maps to 'lp' (via svtav1-params) for libsvtav1 and 'threads' for h264/hevc.
+    encoder_threads: int | None = None
    # Rename map for the observation to override the image and state keys
    rename_map: dict[str, str] = field(default_factory=dict)

@@ -452,6 +467,9 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
                root=cfg.dataset.root,
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                vcodec=cfg.dataset.vcodec,
+                streaming_encoding=cfg.dataset.streaming_encoding,
+                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
+                encoder_threads=cfg.dataset.encoder_threads,
            )

            if hasattr(robot, "cameras") and len(robot.cameras) > 0:
@@ -474,6 +492,9 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
                image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
                batch_encoding_size=cfg.dataset.video_encoding_batch_size,
                vcodec=cfg.dataset.vcodec,
+                streaming_encoding=cfg.dataset.streaming_encoding,
+                encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
+                encoder_threads=cfg.dataset.encoder_threads,
            )

        # Load pretrained policy
@@ -497,6 +518,11 @@ def record(cfg: RecordConfig) -> LeRobotDataset:

        listener, events = init_keyboard_listener()

+        if not cfg.dataset.streaming_encoding:
+            logging.info(
+                "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
+            )
+
        with VideoEncodingManager(dataset):
            recorded_episodes = 0
            while recorded_episodes < cfg.dataset.num_episodes and not events["stop_recording"]: