From f72b28738a1c4836fd90d1892e82e61da4476f0b Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 18 May 2026 16:40:29 +0200 Subject: [PATCH] fix(annotate): default keyframe decode to ffmpeg CLI (thread-safe) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decoder chain tried torchcodec first, then ffmpeg. torchcodec is not thread-safe: under the executor's 16-wide concurrent decode in the interjections phase it SIGSEGVs (exit 139) before the ffmpeg fallback is ever reached — uncatchable, so it kills the whole job. Default the auto chain to ffmpeg only. Per-frame ffmpeg decode runs in an isolated child process: crash-safe and concurrency-safe (the plan phase already proved 16 parallel ffmpeg subprocesses are fine). torchcodec / pyav remain available via an explicit video_backend. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../annotations/steerable_pipeline/config.py | 11 ++++--- .../annotations/steerable_pipeline/frames.py | 33 +++++++++---------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 106300a4b..dd439f9b9 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -189,11 +189,12 @@ class AnnotationPipelineConfig: skip_validation: bool = False only_episodes: tuple[int, ...] | None = None - # Keyframe decode backend. When unset, decoding tries the platform default - # (torchcodec when installed) then falls back to the ffmpeg CLI, which - # decodes AV1 and isolates crashes to a child process. Set to one of - # ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` to pin a single backend — - # e.g. ``"ffmpeg"`` in containers where torchcodec cannot decode. + # Keyframe decode backend. When unset, the pipeline decodes with the + # ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child + # process, which is both crash-safe and safe under the concurrent + # decode the executor performs (torchcodec is not thread-safe and + # SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an + # in-process decoder when its build is known thread-safe. video_backend: str | None = None # When True, upload the annotated dataset to the Hugging Face Hub: diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py index c51ec21f0..112f50ce6 100644 --- a/src/lerobot/annotations/steerable_pipeline/frames.py +++ b/src/lerobot/annotations/steerable_pipeline/frames.py @@ -34,7 +34,6 @@ import PIL.Image import torch from lerobot.datasets.video_utils import decode_video_frames -from lerobot.utils.import_utils import get_safe_default_codec from .reader import EpisodeRecord @@ -135,11 +134,10 @@ class VideoFrameProvider: camera_key: str | None = None tolerance_s: float = 1e-2 cache_size: int = 256 - # Keyframe decode backend. When ``None``, decoding tries the platform - # default (torchcodec when installed) then falls back to the ffmpeg CLI. - # Set explicitly to one of ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` - # to pin a single backend — e.g. ``"ffmpeg"`` to skip a torchcodec that - # cannot decode the dataset's codec ("Operation not permitted"). + # Keyframe decode backend. ``None`` uses the ffmpeg CLI — the + # concurrency- and crash-safe default for the pipeline's threaded + # decode. Set to ``"torchcodec"`` or ``"pyav"`` to pin an in-process + # decoder when the build is known thread-safe. video_backend: str | None = None _meta: Any = field(default=None, init=False, repr=False) _cache: dict = field(default_factory=dict, init=False, repr=False) @@ -303,18 +301,17 @@ class VideoFrameProvider: shifted = [from_timestamp + ts for ts in timestamps] video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key) - # Build the decoder chain. In-process decoders are fragile here: - # torchcodec raises in some containers (vllm-openai: "Operation not - # permitted"), lerobot's ``pyav`` backend routes through - # ``torchvision.io.VideoReader`` (removed in torchvision 0.23+), and - # PyAV can outright SIGSEGV on the AV1 streams modern LeRobot - # datasets use. ``_decode_frames_ffmpeg`` shells out to the ffmpeg - # CLI — it decodes AV1 and a crash stays isolated to the child - # process — so it is the always-available fallback. - if self.video_backend: - chain = [self.video_backend] - else: - chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["ffmpeg"] + # Default to the ffmpeg CLI. The pipeline decodes under a 16-wide + # ThreadPoolExecutor and the in-process decoders are unsafe there: + # torchcodec is not thread-safe and SIGSEGVs under concurrent decode + # (a crash no try/except can catch), PyAV can likewise segfault on + # AV1, and lerobot's ``pyav`` backend routes through the removed + # ``torchvision.io.VideoReader``. ``_decode_frames_ffmpeg`` shells + # out per frame: each decode is an isolated child process, so it is + # both crash-safe and concurrency-safe. ``video_backend`` can pin + # ``torchcodec`` / ``pyav`` explicitly for callers that know their + # build is safe. + chain = [self.video_backend] if self.video_backend else ["ffmpeg"] exc: Exception | None = None for backend in chain: