From 31e0c15e55167b8633099fdb9fa9e6ce5013d666 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 18 May 2026 15:23:53 +0200 Subject: [PATCH] fix(annotate): pyav fallback when torchcodec keyframe decode fails VideoFrameProvider decoded keyframes via torchcodec only. Some containers (e.g. vllm-openai) ship a torchcodec that cannot push packets to the decoder ("Operation not permitted"), silently degrading interjection/vqa prompts to no visual context. _decode now retries with pyav when the default backend raises, and a new `video_backend` config field lets callers pin the backend explicitly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../annotations/steerable_pipeline/config.py | 7 ++ .../annotations/steerable_pipeline/frames.py | 75 +++++++++++++------ src/lerobot/scripts/lerobot_annotate.py | 4 +- 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 8ff8fce78..bb11d0e14 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -189,6 +189,13 @@ class AnnotationPipelineConfig: skip_validation: bool = False only_episodes: tuple[int, ...] | None = None + # Video decode backend for keyframe extraction. When unset, decoding tries + # the platform default (torchcodec when installed) and falls back to + # ``pyav`` on failure. Set to ``"pyav"`` to skip torchcodec entirely — + # useful in containers where torchcodec cannot decode ("Operation not + # permitted"). + video_backend: str | None = None + # When True, upload the annotated dataset to the Hugging Face Hub: # to ``dest_repo_id`` if set, otherwise back to ``repo_id``. One of # the two must be set for this to take effect. diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py index 51092b5a7..b2045fd7d 100644 --- a/src/lerobot/annotations/steerable_pipeline/frames.py +++ b/src/lerobot/annotations/steerable_pipeline/frames.py @@ -34,6 +34,7 @@ import PIL.Image import torch from lerobot.datasets.video_utils import decode_video_frames +from lerobot.utils.import_utils import get_safe_default_codec from .reader import EpisodeRecord @@ -134,6 +135,12 @@ class VideoFrameProvider: camera_key: str | None = None tolerance_s: float = 1e-2 cache_size: int = 256 + # Video decode backend forwarded to ``decode_video_frames``. When ``None``, + # decoding tries the platform default (torchcodec when installed) and + # falls back to ``pyav`` if it raises — some containers ship a torchcodec + # that cannot push packets to the decoder ("Operation not permitted"). + # Set explicitly (e.g. ``"pyav"``) to skip that probe. + video_backend: str | None = None _meta: Any = field(default=None, init=False, repr=False) _cache: dict = field(default_factory=dict, init=False, repr=False) _camera_keys: list[str] = field(default_factory=list, init=False, repr=False) @@ -296,35 +303,55 @@ class VideoFrameProvider: shifted = [from_timestamp + ts for ts in timestamps] video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key) - try: - # Stacked ``(N, C, H, W)`` uint8 tensor; one row per timestamp. - decoded = decode_video_frames(video_path, shifted, self.tolerance_s, return_uint8=True) - return list(decoded) - except Exception as exc: - # Log loudly the first time decoding fails so a silent - # vqa-module no-op (every prompt skipped because frames_at - # returned []) is debuggable from the job log instead of - # post-hoc parquet inspection. Subsequent failures stay quiet. - with self._lock: - already_warned = getattr(self, "_warned_decode_fail", False) - if not already_warned: - self._warned_decode_fail = True - if not already_warned: - logger.warning( - "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s: %s", - episode_index, - camera_key, - video_path, - exc, - exc_info=True, + # When no backend is pinned, try the platform default first and fall + # back to ``pyav`` if it raises — torchcodec is broken in some + # containers (e.g. vllm-openai), where pyav decodes the same file fine. + if self.video_backend: + backends: list[str | None] = [self.video_backend] + else: + backends = [None] + if get_safe_default_codec() != "pyav": + backends.append("pyav") + + exc: Exception | None = None + for backend in backends: + try: + # Stacked ``(N, C, H, W)`` uint8 tensor; one row per timestamp. + decoded = decode_video_frames( + video_path, shifted, self.tolerance_s, backend=backend, return_uint8=True ) - return [] + return list(decoded) + except Exception as e: # noqa: PERF203 + exc = e + + # Every backend raised. Log loudly the first time so a silent + # vqa-module no-op (every prompt skipped because frames_at returned + # []) is debuggable from the job log instead of post-hoc parquet + # inspection. Subsequent failures stay quiet. + with self._lock: + already_warned = getattr(self, "_warned_decode_fail", False) + if not already_warned: + self._warned_decode_fail = True + if not already_warned: + logger.warning( + "VideoFrameProvider._decode failed for episode=%s camera=%s " + "video_path=%s backends=%s: %s", + episode_index, + camera_key, + video_path, + backends, + exc, + exc_info=exc, + ) + return [] -def make_frame_provider(root: Path, camera_key: str | None = None) -> FrameProvider: +def make_frame_provider( + root: Path, camera_key: str | None = None, video_backend: str | None = None +) -> FrameProvider: """Build a :class:`VideoFrameProvider` if videos are present, else null.""" try: - provider = VideoFrameProvider(root=root, camera_key=camera_key) + provider = VideoFrameProvider(root=root, camera_key=camera_key, video_backend=video_backend) except Exception: return null_provider() if provider.camera_key is None: diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py index 5bc91a242..aefe0e9eb 100644 --- a/src/lerobot/scripts/lerobot_annotate.py +++ b/src/lerobot/scripts/lerobot_annotate.py @@ -64,7 +64,9 @@ def annotate(cfg: AnnotationPipelineConfig) -> None: logger.info("annotate: root=%s", root) vlm = make_vlm_client(cfg.vlm) - frame_provider = make_frame_provider(root, camera_key=cfg.vlm.camera_key) + frame_provider = make_frame_provider( + root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend + ) # Surface the resolved cameras up front so a silent vqa-module no-op # is obvious in job output rather than discovered post-hoc by counting # parquet rows.