From f72b28738a1c4836fd90d1892e82e61da4476f0b Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 16:40:29 +0200
Subject: [PATCH] fix(annotate): default keyframe decode to ffmpeg CLI
 (thread-safe)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The decoder chain tried torchcodec first, then ffmpeg. torchcodec is
not thread-safe: under the executor's 16-wide concurrent decode in the
interjections phase it SIGSEGVs (exit 139). A segfault cannot be caught
in Python, so the ffmpeg fallback never runs and the whole job dies.

Default the auto chain to ffmpeg only. Per-frame ffmpeg decode runs in
an isolated child process: crash-safe and concurrency-safe (the plan
phase already proved 16 parallel ffmpeg subprocesses are fine).
torchcodec / pyav remain available via an explicit video_backend.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 11 ++++---
 .../annotations/steerable_pipeline/frames.py  | 33 +++++++++----------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 106300a4b..dd439f9b9 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -189,11 +189,12 @@ class AnnotationPipelineConfig:
     skip_validation: bool = False
     only_episodes: tuple[int, ...] | None = None
 
-    # Keyframe decode backend. When unset, decoding tries the platform default
-    # (torchcodec when installed) then falls back to the ffmpeg CLI, which
-    # decodes AV1 and isolates crashes to a child process. Set to one of
-    # ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` to pin a single backend —
-    # e.g. ``"ffmpeg"`` in containers where torchcodec cannot decode.
+    # Keyframe decode backend. When unset, the pipeline decodes with the
+    # ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child
+    # process, which is both crash-safe and safe under the concurrent
+    # decode the executor performs (torchcodec is not thread-safe and
+    # SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an
+    # in-process decoder only if that decoder's build is known thread-safe.
     video_backend: str | None = None
 
     # When True, upload the annotated dataset to the Hugging Face Hub:
diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py
index c51ec21f0..112f50ce6 100644
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -34,7 +34,6 @@ import PIL.Image
 import torch
 
 from lerobot.datasets.video_utils import decode_video_frames
-from lerobot.utils.import_utils import get_safe_default_codec
 
 from .reader import EpisodeRecord
 
@@ -135,11 +134,10 @@ class VideoFrameProvider:
     camera_key: str | None = None
     tolerance_s: float = 1e-2
     cache_size: int = 256
-    # Keyframe decode backend. When ``None``, decoding tries the platform
-    # default (torchcodec when installed) then falls back to the ffmpeg CLI.
-    # Set explicitly to one of ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"``
-    # to pin a single backend — e.g. ``"ffmpeg"`` to skip a torchcodec that
-    # cannot decode the dataset's codec ("Operation not permitted").
+    # Keyframe decode backend. ``None`` uses the ffmpeg CLI — the
+    # concurrency- and crash-safe default for the pipeline's threaded
+    # decode. Set to ``"torchcodec"`` or ``"pyav"`` to pin an in-process
+    # decoder when the build is known thread-safe.
     video_backend: str | None = None
     _meta: Any = field(default=None, init=False, repr=False)
     _cache: dict = field(default_factory=dict, init=False, repr=False)
@@ -303,18 +301,17 @@ class VideoFrameProvider:
         shifted = [from_timestamp + ts for ts in timestamps]
         video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
 
-        # Build the decoder chain. In-process decoders are fragile here:
-        # torchcodec raises in some containers (vllm-openai: "Operation not
-        # permitted"), lerobot's ``pyav`` backend routes through
-        # ``torchvision.io.VideoReader`` (removed in torchvision 0.23+), and
-        # PyAV can outright SIGSEGV on the AV1 streams modern LeRobot
-        # datasets use. ``_decode_frames_ffmpeg`` shells out to the ffmpeg
-        # CLI — it decodes AV1 and a crash stays isolated to the child
-        # process — so it is the always-available fallback.
-        if self.video_backend:
-            chain = [self.video_backend]
-        else:
-            chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["ffmpeg"]
+        # Default to the ffmpeg CLI. The pipeline decodes under a 16-wide
+        # ThreadPoolExecutor and the in-process decoders are unsafe there:
+        # torchcodec is not thread-safe and SIGSEGVs under concurrent decode
+        # (a crash no try/except can catch), PyAV can likewise segfault on
+        # AV1, and lerobot's ``pyav`` backend routes through
+        # ``torchvision.io.VideoReader`` (removed in torchvision 0.23+).
+        # ``_decode_frames_ffmpeg`` shells out per frame: each decode is an
+        # isolated child process, so it is both crash-safe and
+        # concurrency-safe. ``video_backend`` can pin ``torchcodec`` /
+        # ``pyav`` explicitly for callers that know their build is safe.
+        chain = [self.video_backend] if self.video_backend else ["ffmpeg"]
 
         exc: Exception | None = None
         for backend in chain: