mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 14:09:47 +00:00
fix(annotate): pyav fallback when torchcodec keyframe decode fails
VideoFrameProvider decoded keyframes via torchcodec only. Some containers
(e.g. vllm-openai) ship a torchcodec that cannot push packets to the
decoder ("Operation not permitted"), so keyframe decoding fails and
interjection/vqa prompts silently degrade to having no visual context.
When no backend is pinned, _decode now retries with pyav if the default
backend raises, and a new `video_backend` config field lets callers pin the
backend explicitly (e.g. "pyav" to skip torchcodec entirely).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -189,6 +189,13 @@ class AnnotationPipelineConfig:
|
|||||||
skip_validation: bool = False
|
skip_validation: bool = False
|
||||||
only_episodes: tuple[int, ...] | None = None
|
only_episodes: tuple[int, ...] | None = None
|
||||||
|
|
||||||
|
# Video decode backend for keyframe extraction. When unset, decoding tries
|
||||||
|
# the platform default (torchcodec when installed) and falls back to
|
||||||
|
# ``pyav`` on failure. Set to ``"pyav"`` to skip torchcodec entirely —
|
||||||
|
# useful in containers where torchcodec cannot decode ("Operation not
|
||||||
|
# permitted").
|
||||||
|
video_backend: str | None = None
|
||||||
|
|
||||||
# When True, upload the annotated dataset to the Hugging Face Hub:
|
# When True, upload the annotated dataset to the Hugging Face Hub:
|
||||||
# to ``dest_repo_id`` if set, otherwise back to ``repo_id``. One of
|
# to ``dest_repo_id`` if set, otherwise back to ``repo_id``. One of
|
||||||
# the two must be set for this to take effect.
|
# the two must be set for this to take effect.
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ import PIL.Image
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from lerobot.datasets.video_utils import decode_video_frames
|
from lerobot.datasets.video_utils import decode_video_frames
|
||||||
|
from lerobot.utils.import_utils import get_safe_default_codec
|
||||||
|
|
||||||
from .reader import EpisodeRecord
|
from .reader import EpisodeRecord
|
||||||
|
|
||||||
@@ -134,6 +135,12 @@ class VideoFrameProvider:
|
|||||||
camera_key: str | None = None
|
camera_key: str | None = None
|
||||||
tolerance_s: float = 1e-2
|
tolerance_s: float = 1e-2
|
||||||
cache_size: int = 256
|
cache_size: int = 256
|
||||||
|
# Video decode backend forwarded to ``decode_video_frames``. When ``None``,
|
||||||
|
# decoding tries the platform default (torchcodec when installed) and
|
||||||
|
# falls back to ``pyav`` if it raises — some containers ship a torchcodec
|
||||||
|
# that cannot push packets to the decoder ("Operation not permitted").
|
||||||
|
# Set explicitly (e.g. ``"pyav"``) to skip that probe.
|
||||||
|
video_backend: str | None = None
|
||||||
_meta: Any = field(default=None, init=False, repr=False)
|
_meta: Any = field(default=None, init=False, repr=False)
|
||||||
_cache: dict = field(default_factory=dict, init=False, repr=False)
|
_cache: dict = field(default_factory=dict, init=False, repr=False)
|
||||||
_camera_keys: list[str] = field(default_factory=list, init=False, repr=False)
|
_camera_keys: list[str] = field(default_factory=list, init=False, repr=False)
|
||||||
@@ -296,35 +303,55 @@ class VideoFrameProvider:
|
|||||||
shifted = [from_timestamp + ts for ts in timestamps]
|
shifted = [from_timestamp + ts for ts in timestamps]
|
||||||
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
|
video_path = self.root / self._meta.get_video_file_path(episode_index, camera_key)
|
||||||
|
|
||||||
try:
|
# When no backend is pinned, try the platform default first and fall
|
||||||
# Stacked ``(N, C, H, W)`` uint8 tensor; one row per timestamp.
|
# back to ``pyav`` if it raises — torchcodec is broken in some
|
||||||
decoded = decode_video_frames(video_path, shifted, self.tolerance_s, return_uint8=True)
|
# containers (e.g. vllm-openai), where pyav decodes the same file fine.
|
||||||
return list(decoded)
|
if self.video_backend:
|
||||||
except Exception as exc:
|
backends: list[str | None] = [self.video_backend]
|
||||||
# Log loudly the first time decoding fails so a silent
|
else:
|
||||||
# vqa-module no-op (every prompt skipped because frames_at
|
backends = [None]
|
||||||
# returned []) is debuggable from the job log instead of
|
if get_safe_default_codec() != "pyav":
|
||||||
# post-hoc parquet inspection. Subsequent failures stay quiet.
|
backends.append("pyav")
|
||||||
with self._lock:
|
|
||||||
already_warned = getattr(self, "_warned_decode_fail", False)
|
exc: Exception | None = None
|
||||||
if not already_warned:
|
for backend in backends:
|
||||||
self._warned_decode_fail = True
|
try:
|
||||||
if not already_warned:
|
# Stacked ``(N, C, H, W)`` uint8 tensor; one row per timestamp.
|
||||||
logger.warning(
|
decoded = decode_video_frames(
|
||||||
"VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s: %s",
|
video_path, shifted, self.tolerance_s, backend=backend, return_uint8=True
|
||||||
episode_index,
|
|
||||||
camera_key,
|
|
||||||
video_path,
|
|
||||||
exc,
|
|
||||||
exc_info=True,
|
|
||||||
)
|
)
|
||||||
return []
|
return list(decoded)
|
||||||
|
except Exception as e: # noqa: PERF203
|
||||||
|
exc = e
|
||||||
|
|
||||||
|
# Every backend raised. Log loudly the first time so a silent
|
||||||
|
# vqa-module no-op (every prompt skipped because frames_at returned
|
||||||
|
# []) is debuggable from the job log instead of post-hoc parquet
|
||||||
|
# inspection. Subsequent failures stay quiet.
|
||||||
|
with self._lock:
|
||||||
|
already_warned = getattr(self, "_warned_decode_fail", False)
|
||||||
|
if not already_warned:
|
||||||
|
self._warned_decode_fail = True
|
||||||
|
if not already_warned:
|
||||||
|
logger.warning(
|
||||||
|
"VideoFrameProvider._decode failed for episode=%s camera=%s "
|
||||||
|
"video_path=%s backends=%s: %s",
|
||||||
|
episode_index,
|
||||||
|
camera_key,
|
||||||
|
video_path,
|
||||||
|
backends,
|
||||||
|
exc,
|
||||||
|
exc_info=exc,
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def make_frame_provider(root: Path, camera_key: str | None = None) -> FrameProvider:
|
def make_frame_provider(
|
||||||
|
root: Path, camera_key: str | None = None, video_backend: str | None = None
|
||||||
|
) -> FrameProvider:
|
||||||
"""Build a :class:`VideoFrameProvider` if videos are present, else null."""
|
"""Build a :class:`VideoFrameProvider` if videos are present, else null."""
|
||||||
try:
|
try:
|
||||||
provider = VideoFrameProvider(root=root, camera_key=camera_key)
|
provider = VideoFrameProvider(root=root, camera_key=camera_key, video_backend=video_backend)
|
||||||
except Exception:
|
except Exception:
|
||||||
return null_provider()
|
return null_provider()
|
||||||
if provider.camera_key is None:
|
if provider.camera_key is None:
|
||||||
|
|||||||
@@ -64,7 +64,9 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
|
|||||||
logger.info("annotate: root=%s", root)
|
logger.info("annotate: root=%s", root)
|
||||||
|
|
||||||
vlm = make_vlm_client(cfg.vlm)
|
vlm = make_vlm_client(cfg.vlm)
|
||||||
frame_provider = make_frame_provider(root, camera_key=cfg.vlm.camera_key)
|
frame_provider = make_frame_provider(
|
||||||
|
root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend
|
||||||
|
)
|
||||||
# Surface the resolved cameras up front so a silent vqa-module no-op
|
# Surface the resolved cameras up front so a silent vqa-module no-op
|
||||||
# is obvious in job output rather than discovered post-hoc by counting
|
# is obvious in job output rather than discovered post-hoc by counting
|
||||||
# parquet rows.
|
# parquet rows.
|
||||||
|
|||||||
Reference in New Issue
Block a user