mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-24 13:09:43 +00:00
fix(annotate): decode video frames with PyAV directly
``lerobot.datasets.video_utils.decode_video_frames`` routes ``backend="pyav"`` through ``decode_video_frames_torchvision`` → ``torchvision.io.VideoReader``, but ``VideoReader`` was removed in torchvision >= 0.22 (the vllm/vllm-openai:latest container ships with torchvision 0.25). That made every Module 3 frame decode raise ``AttributeError: module 'torchvision.io' has no attribute 'VideoReader'``, which the previous catch-all silently turned into an empty image list, which then made every Module 3 prompt skip via the ``not _has_image_block(messages)`` branch and produce zero VQA rows. Bypass ``video_utils`` entirely. The annotation pipeline only needs a handful of PIL frames per (episode, ts), so a direct PyAV decode is both simpler and insulated from torchvision API churn. ``av`` is already in the install set, no new dependency. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -185,29 +185,13 @@ class VideoFrameProvider:
|
|||||||
def _decode(
|
def _decode(
|
||||||
self, episode_index: int, timestamps: list[float], camera_key: str
|
self, episode_index: int, timestamps: list[float], camera_key: str
|
||||||
) -> list[Any]:
|
) -> list[Any]:
|
||||||
import os as _os # noqa: PLC0415
|
|
||||||
|
|
||||||
from PIL import Image # noqa: PLC0415
|
|
||||||
|
|
||||||
from lerobot.datasets.video_utils import decode_video_frames # noqa: PLC0415
|
|
||||||
|
|
||||||
ep = self._meta.episodes[episode_index]
|
ep = self._meta.episodes[episode_index]
|
||||||
from_timestamp = ep[f"videos/{camera_key}/from_timestamp"]
|
from_timestamp = ep[f"videos/{camera_key}/from_timestamp"]
|
||||||
shifted = [from_timestamp + ts for ts in timestamps]
|
shifted = [from_timestamp + ts for ts in timestamps]
|
||||||
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
|
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
|
||||||
# ``torchcodec`` import currently bad-allocs on cu128/torch-2.8 in
|
|
||||||
# some environments; default to ``pyav`` (always available via
|
|
||||||
# the ``av`` package) and let users override with
|
|
||||||
# LEROBOT_VIDEO_BACKEND=torchcodec when their stack supports it.
|
|
||||||
backend = _os.environ.get("LEROBOT_VIDEO_BACKEND", "pyav")
|
|
||||||
try:
|
try:
|
||||||
frames = decode_video_frames(
|
return _decode_pyav_direct(video_path, shifted, self.tolerance_s)
|
||||||
video_path,
|
|
||||||
shifted,
|
|
||||||
self.tolerance_s,
|
|
||||||
backend=backend,
|
|
||||||
return_uint8=True,
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
# Log loudly the first time decoding fails so silent
|
# Log loudly the first time decoding fails so silent
|
||||||
# Module-3-no-op (every prompt skipped because frames_at returned
|
# Module-3-no-op (every prompt skipped because frames_at returned
|
||||||
@@ -218,24 +202,79 @@ class VideoFrameProvider:
|
|||||||
|
|
||||||
logging.getLogger(__name__).warning(
|
logging.getLogger(__name__).warning(
|
||||||
"VideoFrameProvider._decode failed for episode=%s camera=%s "
|
"VideoFrameProvider._decode failed for episode=%s camera=%s "
|
||||||
"video_path=%s backend=%s: %s",
|
"video_path=%s: %s",
|
||||||
episode_index,
|
episode_index,
|
||||||
camera_key,
|
camera_key,
|
||||||
video_path,
|
video_path,
|
||||||
backend,
|
|
||||||
exc,
|
exc,
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
self._warned_decode_fail = True
|
self._warned_decode_fail = True
|
||||||
return []
|
return []
|
||||||
# frames: [N, C, H, W] uint8, RGB
|
|
||||||
out: list[Any] = []
|
|
||||||
arr = frames.cpu().numpy() if hasattr(frames, "cpu") else frames
|
def _decode_pyav_direct(
|
||||||
for i in range(arr.shape[0]):
|
video_path: Any, timestamps: list[float], tolerance_s: float
|
||||||
chw = arr[i]
|
) -> list[Any]:
|
||||||
hwc = chw.transpose(1, 2, 0)
|
"""Decode the requested timestamps from ``video_path`` using PyAV directly.
|
||||||
out.append(Image.fromarray(hwc, mode="RGB"))
|
|
||||||
return out
|
Bypasses ``lerobot.datasets.video_utils.decode_video_frames`` entirely
|
||||||
|
because its "pyav" path actually goes through
|
||||||
|
``decode_video_frames_torchvision`` → ``torchvision.io.VideoReader``,
|
||||||
|
which was removed in torchvision >= 0.22 (the vllm/vllm-openai:latest
|
||||||
|
container ships with torchvision 0.25). The annotation pipeline only
|
||||||
|
needs a handful of PIL images per (episode, ts), so we can decode them
|
||||||
|
with PyAV without any torch dependency at all.
|
||||||
|
|
||||||
|
Returns one ``PIL.Image`` per requested timestamp, in the same order.
|
||||||
|
Any timestamp the decoder couldn't reach is silently dropped (mirrors
|
||||||
|
the previous behaviour); callers filter ``None``/missing entries.
|
||||||
|
"""
|
||||||
|
import av # noqa: PLC0415
|
||||||
|
from PIL import Image # noqa: PLC0415
|
||||||
|
|
||||||
|
if not timestamps:
|
||||||
|
return []
|
||||||
|
|
||||||
|
targets = sorted(set(timestamps))
|
||||||
|
seek_to = max(0.0, min(targets) - max(0.5, tolerance_s))
|
||||||
|
|
||||||
|
container = av.open(str(video_path))
|
||||||
|
try:
|
||||||
|
stream = container.streams.video[0]
|
||||||
|
# PyAV needs the seek target in stream timebase ticks.
|
||||||
|
if stream.time_base is None:
|
||||||
|
seek_pts = 0
|
||||||
|
else:
|
||||||
|
seek_pts = int(seek_to / float(stream.time_base))
|
||||||
|
try:
|
||||||
|
container.seek(seek_pts, any_frame=False, backward=True, stream=stream)
|
||||||
|
except av.AVError:
|
||||||
|
# Some streams reject the explicit seek; fall back to decoding from start.
|
||||||
|
container.seek(0)
|
||||||
|
|
||||||
|
results: dict[float, Any] = {}
|
||||||
|
target_iter = iter(targets)
|
||||||
|
next_target = next(target_iter, None)
|
||||||
|
for frame in container.decode(stream):
|
||||||
|
if next_target is None:
|
||||||
|
break
|
||||||
|
ts = float(frame.pts * frame.time_base) if frame.pts is not None else None
|
||||||
|
if ts is None:
|
||||||
|
continue
|
||||||
|
# Walk past targets we've already overshot — we keep the closest
|
||||||
|
# frame within tolerance.
|
||||||
|
while next_target is not None and ts >= next_target - tolerance_s:
|
||||||
|
if abs(ts - next_target) <= tolerance_s or ts >= next_target:
|
||||||
|
img = frame.to_image() # PIL.Image.Image (RGB)
|
||||||
|
results.setdefault(next_target, img)
|
||||||
|
next_target = next(target_iter, None)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
finally:
|
||||||
|
container.close()
|
||||||
|
|
||||||
|
return [results[ts] for ts in timestamps if ts in results]
|
||||||
|
|
||||||
def video_for_episode(
|
def video_for_episode(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user