From d03200bdb3b807d19d0c90365b9779842bce836e Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 30 Mar 2026 16:42:52 +0200
Subject: [PATCH] fix: force torchvision video backend instead of cv2 bypass

Replace manual cv2 frame reading with FORCE_QWENVL_VIDEO_READER=torchvision
env var. The torchvision backend (PyAV) properly reads video metadata and
respects the fps parameter, avoiding the torchcodec fps=24 default issue.

Made-with: Cursor
---
 .../data_annotations/vlm_annotations.py | 32 ++++++-------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
index 010b26504..a7702a422 100644
--- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py
+++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
@@ -15,13 +15,14 @@
 # VLM Interface (Abstract Base Class for Modularity)
 
 import json
+import os
 import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 
-import cv2
 import torch
-from PIL import Image
+
+os.environ.setdefault("FORCE_QWENVL_VIDEO_READER", "torchvision")
 
 from lerobot.data_processing.data_annotations.subtask_annotations import Skill
 from lerobot.utils.constants import (
@@ -89,21 +90,6 @@ class BaseVLM(ABC):
         pass
 
 
-def _load_video_frames(video_path: Path) -> list[Image.Image]:
-    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults."""
-    frames: list[Image.Image] = []
-    cap = cv2.VideoCapture(str(video_path))
-    try:
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
-    finally:
-        cap.release()
-    return frames
-
-
 def create_skill_segmentation_prompt(
     coarse_goal: str | None = None,
     subtask_labels: list[str] | None = None,
@@ -166,7 +152,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -215,7 +201,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -339,7 +325,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -387,7 +373,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -503,7 +489,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -551,7 +537,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
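
Note: a minimal standalone sketch of the intended flow, for reviewers only (not part of the patch). It assumes qwen_vl_utils consumes these message dicts and picks its video reader from FORCE_QWENVL_VIDEO_READER at import time, which is why the patch sets the variable before the lerobot imports; the video path below is a hypothetical placeholder.

    import os

    # Assumption: qwen_vl_utils reads this variable when it is imported, so it
    # must be set before the import below (same ordering the patch enforces).
    os.environ.setdefault("FORCE_QWENVL_VIDEO_READER", "torchvision")

    from qwen_vl_utils import process_vision_info  # noqa: E402

    # Hypothetical local video path, for illustration only.
    video_path = "episode_000000.mp4"

    messages = [
        {
            "role": "user",
            "content": [
                # Path string plus an explicit fps: the torchvision (PyAV)
                # backend reads the real metadata and samples 1 frame per second.
                {"type": "video", "video": video_path, "fps": 1.0},
                {"type": "text", "text": "Segment into atomic skills."},
            ],
        }
    ]

    # image_inputs is None here; video_inputs holds the sampled frames.
    image_inputs, video_inputs = process_vision_info(messages)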