fix(annotate): default keyframe decode to ffmpeg CLI (thread-safe)

Previously the decoder chain tried torchcodec first and fell back to ffmpeg. torchcodec is not thread-safe: under the executor's 16-wide concurrent decode in the interjections phase it SIGSEGVs (exit 139) before the ffmpeg fallback is ever reached — a crash no try/except can catch, so it kills the whole job. Default the auto chain (video_backend unset) to ffmpeg only. Per-frame ffmpeg decode runs in an isolated child process: crash-safe and concurrency-safe (the plan phase already proved 16 parallel ffmpeg subprocesses are fine). torchcodec / pyav remain available via an explicit video_backend. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 02:29:47 +00:00 · 2026-05-18 16:40:29 +02:00
parent 1bd53cc7da
commit f72b28738a
2 changed files with 21 additions and 23 deletions
@@ -189,11 +189,12 @@ class AnnotationPipelineConfig:
    skip_validation: bool = False
    only_episodes: tuple[int, ...] | None = None

-    # Keyframe decode backend. When unset, decoding tries the platform default
-    # (torchcodec when installed) then falls back to the ffmpeg CLI, which
-    # decodes AV1 and isolates crashes to a child process. Set to one of
-    # ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` to pin a single backend —
-    # e.g. ``"ffmpeg"`` in containers where torchcodec cannot decode.
+    # Keyframe decode backend. When unset, the pipeline decodes with the
+    # ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child
+    # process, which is both crash-safe and safe under the concurrent
+    # decode the executor performs (torchcodec is not thread-safe and
+    # SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an
+    # in-process decoder when its build is known thread-safe.
    video_backend: str | None = None

    # When True, upload the annotated dataset to the Hugging Face Hub:
@@ -34,7 +34,6 @@ import PIL.Image
 import torch

 from lerobot.datasets.video_utils import decode_video_frames
-from lerobot.utils.import_utils import get_safe_default_codec

 from .reader import EpisodeRecord

@@ -135,11 +134,10 @@ class VideoFrameProvider:
    camera_key: str | None = None
    tolerance_s: float = 1e-2
    cache_size: int = 256
-    # Keyframe decode backend. When ``None``, decoding tries the platform
-    # default (torchcodec when installed) then falls back to the ffmpeg CLI.
-    # Set explicitly to one of ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"``
-    # to pin a single backend — e.g. ``"ffmpeg"`` to skip a torchcodec that
-    # cannot decode the dataset's codec ("Operation not permitted").
+    # Keyframe decode backend. ``None`` uses the ffmpeg CLI — the
+    # concurrency- and crash-safe default for the pipeline's threaded
+    # decode. Set to ``"torchcodec"`` or ``"pyav"`` to pin an in-process
+    # decoder when the build is known thread-safe.
    video_backend: str | None = None
    _meta: Any = field(default=None, init=False, repr=False)
    _cache: dict = field(default_factory=dict, init=False, repr=False)
@@ -303,18 +301,17 @@ class VideoFrameProvider:
        shifted = [from_timestamp + ts for ts in timestamps]
        video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)

-        # Build the decoder chain. In-process decoders are fragile here:
-        # torchcodec raises in some containers (vllm-openai: "Operation not
-        # permitted"), lerobot's ``pyav`` backend routes through
-        # ``torchvision.io.VideoReader`` (removed in torchvision 0.23+), and
-        # PyAV can outright SIGSEGV on the AV1 streams modern LeRobot
-        # datasets use. ``_decode_frames_ffmpeg`` shells out to the ffmpeg
-        # CLI — it decodes AV1 and a crash stays isolated to the child
-        # process — so it is the always-available fallback.
-        if self.video_backend:
-            chain = [self.video_backend]
-        else:
-            chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["ffmpeg"]
+        # Default to the ffmpeg CLI. The pipeline decodes under a 16-wide
+        # ThreadPoolExecutor and the in-process decoders are unsafe there:
+        # torchcodec is not thread-safe and SIGSEGVs under concurrent decode
+        # (a crash no try/except can catch), PyAV can likewise segfault on
+        # AV1, and lerobot's ``pyav`` backend routes through
+        # ``torchvision.io.VideoReader`` (removed in torchvision 0.23+).
+        # ``_decode_frames_ffmpeg`` shells out per frame: each decode is an
+        # isolated child process, so it is both crash-safe and
+        # concurrency-safe. ``video_backend`` can pin ``torchcodec`` /
+        # ``pyav`` explicitly for callers that know their build is safe.
+        chain = [self.video_backend] if self.video_backend else ["ffmpeg"]

        exc: Exception | None = None
        for backend in chain: