fix(annotate): default keyframe decode to ffmpeg CLI (thread-safe)

The decoder chain tried torchcodec first, then ffmpeg. torchcodec is
not thread-safe: under the executor's 16-wide concurrent decode in the
interjections phase it SIGSEGVs (exit 139) before the ffmpeg fallback
is ever reached. A segfault cannot be caught by try/except, so it
kills the whole job.

Default the auto chain to ffmpeg only. Per-frame ffmpeg decode runs in
an isolated child process: crash-safe and concurrency-safe (the plan
phase already proved 16 parallel ffmpeg subprocesses are fine).
torchcodec / pyav remain available via an explicit video_backend.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-18 16:40:29 +02:00
parent 1bd53cc7da
commit f72b28738a
2 changed files with 21 additions and 23 deletions
@@ -189,11 +189,12 @@ class AnnotationPipelineConfig:
skip_validation: bool = False
only_episodes: tuple[int, ...] | None = None
# Keyframe decode backend. When unset, decoding tries the platform default
# (torchcodec when installed) then falls back to the ffmpeg CLI, which
# decodes AV1 and isolates crashes to a child process. Set to one of
# ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` to pin a single backend —
# e.g. ``"ffmpeg"`` in containers where torchcodec cannot decode.
# Keyframe decode backend. When unset, the pipeline decodes with the
# ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child
# process, which is both crash-safe and safe under the concurrent
# decode the executor performs (torchcodec is not thread-safe and
# SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an
# in-process decoder when its build is known thread-safe.
video_backend: str | None = None
# When True, upload the annotated dataset to the Hugging Face Hub:
@@ -34,7 +34,6 @@ import PIL.Image
import torch
from lerobot.datasets.video_utils import decode_video_frames
from lerobot.utils.import_utils import get_safe_default_codec
from .reader import EpisodeRecord
@@ -135,11 +134,10 @@ class VideoFrameProvider:
camera_key: str | None = None
tolerance_s: float = 1e-2
cache_size: int = 256
# Keyframe decode backend. When ``None``, decoding tries the platform
# default (torchcodec when installed) then falls back to the ffmpeg CLI.
# Set explicitly to one of ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"``
# to pin a single backend — e.g. ``"ffmpeg"`` to skip a torchcodec that
# cannot decode the dataset's codec ("Operation not permitted").
# Keyframe decode backend. ``None`` uses the ffmpeg CLI — the
# concurrency- and crash-safe default for the pipeline's threaded
# decode. Set to ``"torchcodec"`` or ``"pyav"`` to pin an in-process
# decoder when the build is known thread-safe.
video_backend: str | None = None
_meta: Any = field(default=None, init=False, repr=False)
_cache: dict = field(default_factory=dict, init=False, repr=False)
@@ -303,18 +301,17 @@ class VideoFrameProvider:
shifted = [from_timestamp + ts for ts in timestamps]
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
# Build the decoder chain. In-process decoders are fragile here:
# torchcodec raises in some containers (vllm-openai: "Operation not
# permitted"), lerobot's ``pyav`` backend routes through
# ``torchvision.io.VideoReader`` (removed in torchvision 0.23+), and
# PyAV can outright SIGSEGV on the AV1 streams modern LeRobot
# datasets use. ``_decode_frames_ffmpeg`` shells out to the ffmpeg
# CLI — it decodes AV1 and a crash stays isolated to the child
# process — so it is the always-available fallback.
if self.video_backend:
chain = [self.video_backend]
else:
chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["ffmpeg"]
# Default to the ffmpeg CLI. The pipeline decodes under a 16-wide
# ThreadPoolExecutor and the in-process decoders are unsafe there:
# torchcodec is not thread-safe and SIGSEGVs under concurrent decode
# (a crash no try/except can catch), PyAV can likewise segfault on
# AV1, and lerobot's ``pyav`` backend routes through
# ``torchvision.io.VideoReader`` (removed in torchvision 0.23+).
# ``_decode_frames_ffmpeg`` shells out per frame: each decode is an
# isolated child process, so it is both crash-safe and
# concurrency-safe. ``video_backend`` can pin ``torchcodec`` / ``pyav``
# explicitly for callers that know their build is safe.
chain = [self.video_backend] if self.video_backend else ["ffmpeg"]
exc: Exception | None = None
for backend in chain: