From 0f6e3230df65e1399b52d34bb10e8f4d131aba35 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 30 Apr 2026 14:19:25 +0200
Subject: [PATCH] fix(annotate): decode video frames with PyAV directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``lerobot.datasets.video_utils.decode_video_frames`` routes
``backend="pyav"`` through ``decode_video_frames_torchvision`` →
``torchvision.io.VideoReader``, but ``VideoReader`` was removed in
torchvision >= 0.22 (the vllm/vllm-openai:latest container ships with
torchvision 0.25). That made every Module 3 frame decode raise
``AttributeError: module 'torchvision.io' has no attribute 'VideoReader'``,
which the previous catch-all silently turned into an empty image list,
which then made every Module 3 prompt skip via the
``not _has_image_block(messages)`` branch and produce zero VQA rows.

Bypass ``video_utils`` entirely. The annotation pipeline only needs
a handful of PIL frames per (episode, ts), so a direct PyAV decode is
both simpler and insulated from torchvision API churn. ``av`` is already
in the install set, so this adds no new dependency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/frames.py  | 95 +++++++++++++------
 1 file changed, 67 insertions(+), 28 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py
index 7a37bc2c4..20a6b6081 100644
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -185,29 +185,13 @@ class VideoFrameProvider:
     def _decode(
         self, episode_index: int, timestamps: list[float], camera_key: str
     ) -> list[Any]:
-        import os as _os  # noqa: PLC0415
-
-        from PIL import Image  # noqa: PLC0415
-
-        from lerobot.datasets.video_utils import decode_video_frames  # noqa: PLC0415
-
         ep = self._meta.episodes[episode_index]
         from_timestamp = ep[f"videos/{camera_key}/from_timestamp"]
         shifted = [from_timestamp + ts for ts in timestamps]
         video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
-        # ``torchcodec`` import currently bad-allocs on cu128/torch-2.8 in
-        # some environments; default to ``pyav`` (always available via
-        # the ``av`` package) and let users override with
-        # LEROBOT_VIDEO_BACKEND=torchcodec when their stack supports it.
-        backend = _os.environ.get("LEROBOT_VIDEO_BACKEND", "pyav")
+
         try:
-            frames = decode_video_frames(
-                video_path,
-                shifted,
-                self.tolerance_s,
-                backend=backend,
-                return_uint8=True,
-            )
+            return _decode_pyav_direct(video_path, shifted, self.tolerance_s)
         except Exception as exc:
             # Log loudly the first time decoding fails so silent
             # Module-3-no-op (every prompt skipped because frames_at returned
@@ -218,24 +202,79 @@ class VideoFrameProvider:
 
                 logging.getLogger(__name__).warning(
                     "VideoFrameProvider._decode failed for episode=%s camera=%s "
-                    "video_path=%s backend=%s: %s",
+                    "video_path=%s: %s",
                     episode_index,
                     camera_key,
                     video_path,
-                    backend,
                     exc,
                     exc_info=True,
                 )
                 self._warned_decode_fail = True
             return []
-        # frames: [N, C, H, W] uint8, RGB
-        out: list[Any] = []
-        arr = frames.cpu().numpy() if hasattr(frames, "cpu") else frames
-        for i in range(arr.shape[0]):
-            chw = arr[i]
-            hwc = chw.transpose(1, 2, 0)
-            out.append(Image.fromarray(hwc, mode="RGB"))
-        return out
+
+
+def _decode_pyav_direct(
+    video_path: Any, timestamps: list[float], tolerance_s: float
+) -> list[Any]:
+    """Decode frames for ``timestamps`` from ``video_path`` using PyAV directly.
+
+    Bypasses ``lerobot.datasets.video_utils.decode_video_frames`` because its
+    "pyav" path goes through ``torchvision.io.VideoReader``, which was removed
+    in torchvision >= 0.22 (vllm/vllm-openai:latest ships torchvision 0.25).
+
+    Args:
+        video_path: Video file to open; handed to ``av.open`` as ``str``.
+        timestamps: Requested times in seconds, matched against each decoded
+            frame's ``pts * time_base``.
+        tolerance_s: Each target takes the first decoded frame at or after
+            ``target - tolerance_s``; the initial seek also backs off by
+            ``max(0.5, tolerance_s)`` before the earliest target.
+
+    Returns:
+        ``PIL.Image`` objects in ``timestamps`` order. Targets the decoder
+        never reaches are dropped, so the list may be shorter than the input.
+    """
+    # NOTE(review): this def is at module level, but the indented
+    # ``video_for_episode`` def follows it, so as placed that method nests in
+    # here instead of in ``VideoFrameProvider`` — move this helper out of the class.
+    import av  # noqa: PLC0415
+
+    if not timestamps:
+        return []
+
+    targets = sorted(set(timestamps))
+    seek_to = max(0.0, targets[0] - max(0.5, tolerance_s))
+    # Newer PyAV dropped the ``av.AVError`` alias in favour of ``FFmpegError``.
+    seek_errors = getattr(av, "FFmpegError", None) or av.AVError
+
+    results: dict[float, Any] = {}
+    with av.open(str(video_path)) as container:
+        stream = container.streams.video[0]
+        # PyAV needs the seek target in stream timebase ticks.
+        time_base = stream.time_base
+        seek_pts = 0 if time_base is None else int(seek_to / float(time_base))
+        try:
+            container.seek(seek_pts, any_frame=False, backward=True, stream=stream)
+        except seek_errors:
+            # Some streams reject the explicit seek; decode from the start.
+            container.seek(0)
+
+        pending = iter(targets)
+        next_target = next(pending)
+        for frame in container.decode(stream):
+            if frame.pts is None:
+                continue
+            ts = float(frame.pts * frame.time_base)
+            # One frame may serve several targets: hand it to every pending
+            # target whose lower tolerance bound it has reached.
+            while next_target is not None and ts >= next_target - tolerance_s:
+                results[next_target] = frame.to_image()  # PIL.Image (RGB)
+                next_target = next(pending, None)
+            if next_target is None:
+                # All targets served; stop before decoding another frame.
+                break
+
+    return [results[ts] for ts in timestamps if ts in results]
 
     def video_for_episode(
         self,