fix(annotate): decode keyframes via ffmpeg CLI fallback

PyAV segfaulted (exit 139) decoding the AV1 streams modern LeRobot
datasets use — a SIGSEGV that the per-episode try/except cannot catch,
killing the whole job when the interjections phase started.

Replace the PyAV fallback with _decode_frames_ffmpeg, which shells out
to the ffmpeg CLI: a full ffmpeg build decodes AV1, and a child-process
crash is a catchable non-zero exit rather than a segfault. Decoder chain
is now torchcodec -> ffmpeg. _decode_frames_av stays available behind
video_backend="pyav" for callers that want it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-18 16:08:31 +02:00
parent 7128bb1769
commit 1bd53cc7da
3 changed files with 86 additions and 19 deletions
@@ -189,11 +189,11 @@ class AnnotationPipelineConfig:
skip_validation: bool = False
only_episodes: tuple[int, ...] | None = None
# Video decode backend for keyframe extraction. When unset, decoding tries
# the platform default (torchcodec when installed) and falls back to
# ``pyav`` on failure. Set to ``"pyav"`` to skip torchcodec entirely —
# useful in containers where torchcodec cannot decode ("Operation not
# permitted").
# Keyframe decode backend. When unset, decoding tries the platform default
# (torchcodec when installed) then falls back to the ffmpeg CLI, which
# decodes AV1 and isolates crashes to a child process. Set to one of
# ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"`` to pin a single backend —
# e.g. ``"ffmpeg"`` in containers where torchcodec cannot decode.
video_backend: str | None = None
# When True, upload the annotated dataset to the Hugging Face Hub:
@@ -135,11 +135,11 @@ class VideoFrameProvider:
camera_key: str | None = None
tolerance_s: float = 1e-2
cache_size: int = 256
# Video decode backend forwarded to ``decode_video_frames``. When ``None``,
# decoding tries the platform default (torchcodec when installed) and
# falls back to ``pyav`` if it raises — some containers ship a torchcodec
# that cannot push packets to the decoder ("Operation not permitted").
# Set explicitly (e.g. ``"pyav"``) to skip that probe.
# Keyframe decode backend. When ``None``, decoding tries the platform
# default (torchcodec when installed) then falls back to the ffmpeg CLI.
# Set explicitly to one of ``"torchcodec"``, ``"ffmpeg"``, or ``"pyav"``
# to pin a single backend — e.g. ``"ffmpeg"`` to skip a torchcodec that
# cannot decode the dataset's codec ("Operation not permitted").
video_backend: str | None = None
_meta: Any = field(default=None, init=False, repr=False)
_cache: dict = field(default_factory=dict, init=False, repr=False)
@@ -303,19 +303,24 @@ class VideoFrameProvider:
shifted = [from_timestamp + ts for ts in timestamps]
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
# Build the decoder chain. torchcodec is fast but unusable in some
# containers (vllm-openai: "Operation not permitted"); lerobot's
# ``pyav`` backend routes through ``torchvision.io.VideoReader``,
# removed in torchvision 0.23+. ``_decode_frames_av`` talks to the
# ``av`` package directly and is the always-available fallback.
# Build the decoder chain. In-process decoders are fragile here:
# torchcodec raises in some containers (vllm-openai: "Operation not
# permitted"), lerobot's ``pyav`` backend routes through
# ``torchvision.io.VideoReader`` (removed in torchvision 0.23+), and
# PyAV can outright SIGSEGV on the AV1 streams modern LeRobot
# datasets use. ``_decode_frames_ffmpeg`` shells out to the ffmpeg
# CLI — it decodes AV1 and a crash stays isolated to the child
# process — so it is the always-available fallback.
if self.video_backend:
chain = [self.video_backend]
else:
chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["pyav"]
chain = (["torchcodec"] if get_safe_default_codec() == "torchcodec" else []) + ["ffmpeg"]
exc: Exception | None = None
for backend in chain:
try:
if backend == "ffmpeg":
return _decode_frames_ffmpeg(video_path, shifted)
if backend in ("pyav", "av"):
return _decode_frames_av(video_path, shifted)
# Stacked ``(N, C, H, W)`` uint8 tensor; one row per timestamp.
@@ -361,14 +366,52 @@ def make_frame_provider(
return provider
def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any]:
"""Decode the frames nearest to ``timestamps`` via the ffmpeg CLI.
Runs one ``ffmpeg`` process per timestamp, seeking with ``-ss`` and
piping a single PNG to stdout. Unlike the in-process decoders this
survives a hostile container: a full ffmpeg build decodes AV1 (the codec
modern LeRobot datasets use) where torchcodec raises and PyAV can
SIGSEGV, and a crash stays isolated to the child process a non-zero
exit is a catchable error, not a segfault of the whole job. Returns one
``(C, H, W)`` uint8 tensor per timestamp.
"""
import io # noqa: PLC0415
import subprocess # noqa: PLC0415
import numpy as np # noqa: PLC0415
frames: list[Any] = []
for ts in timestamps:
proc = subprocess.run(
[
"ffmpeg", "-nostdin", "-loglevel", "error",
"-ss", f"{max(ts, 0.0):.3f}",
"-i", str(video_path),
"-frames:v", "1",
"-f", "image2pipe", "-vcodec", "png", "pipe:1",
],
capture_output=True,
check=True,
timeout=120,
)
if not proc.stdout:
raise RuntimeError(f"ffmpeg returned no frame for t={ts:.3f}s of {video_path}")
img = PIL.Image.open(io.BytesIO(proc.stdout)).convert("RGB")
frames.append(torch.from_numpy(np.asarray(img).copy()).permute(2, 0, 1).contiguous())
return frames
def _decode_frames_av(video_path: Path, timestamps: list[float]) -> list[Any]:
"""Decode the frames nearest to ``timestamps`` using PyAV directly.
lerobot's ``decode_video_frames(backend="pyav")`` routes through
``torchvision.io.VideoReader``, removed in torchvision 0.23+. This helper
talks to the ``av`` package directly so keyframe extraction keeps working
on modern torch/torchvision stacks and in containers where torchcodec
cannot decode. Returns one ``(C, H, W)`` uint8 tensor per timestamp.
talks to the ``av`` package directly. Note PyAV can SIGSEGV on AV1
streams in some builds — prefer ``_decode_frames_ffmpeg`` as the default
fallback; this stays available behind ``video_backend="pyav"``. Returns
one ``(C, H, W)`` uint8 tensor per timestamp.
"""
import av # noqa: PLC0415
+24
View File
@@ -41,6 +41,7 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da
from lerobot.annotations.steerable_pipeline.frames import ( # noqa: E402
VideoFrameProvider,
_decode_frames_av,
_decode_frames_ffmpeg,
)
@@ -120,3 +121,26 @@ def test_decode_frames_av_raises_on_missing_file(tmp_path: Path) -> None:
"""A missing video surfaces as an exception the caller can fall back on."""
with pytest.raises(Exception): # noqa: B017, PT011
_decode_frames_av(tmp_path / "does_not_exist.mp4", [0.0])
def test_decode_frames_ffmpeg_returns_one_uint8_frame_per_timestamp(sample_video: Path) -> None:
    """``_decode_frames_ffmpeg`` yields one uint8 ``(C, H, W)`` tensor per timestamp.

    The decoder shells out to the ffmpeg CLI, so the test is skipped (not
    failed) when no ``ffmpeg`` executable is on ``PATH``.
    """
    import shutil  # noqa: PLC0415

    if shutil.which("ffmpeg") is None:
        pytest.skip("ffmpeg not available")
    timestamps = [0.0, 1.0, 2.5]
    frames = _decode_frames_ffmpeg(sample_video, timestamps)
    assert len(frames) == len(timestamps)
    for frame in frames:
        assert isinstance(frame, torch.Tensor)
        assert frame.dtype == torch.uint8
        # NOTE(review): presumes the ``sample_video`` fixture is 160x120 RGB — confirm against the fixture.
        assert frame.shape == (3, 120, 160)
def test_decode_frames_ffmpeg_raises_on_missing_file(tmp_path: Path) -> None:
    """Decoding a nonexistent video must raise an exception.

    Skipped when no ``ffmpeg`` executable can be found on ``PATH``.
    """
    ffmpeg_exe = shutil.which("ffmpeg")
    if ffmpeg_exe is None:
        pytest.skip("ffmpeg not available")
    missing_video = tmp_path / "does_not_exist.mp4"
    with pytest.raises(Exception):  # noqa: B017, PT011
        _decode_frames_ffmpeg(missing_video, [0.0])