From d03200bdb3b807d19d0c90365b9779842bce836e Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 30 Mar 2026 16:42:52 +0200
Subject: [PATCH] fix: force torchvision video backend instead of cv2 bypass

Replace manual cv2 frame reading with FORCE_QWENVL_VIDEO_READER=torchvision
env var. The torchvision backend (PyAV) properly reads video metadata and
respects the fps parameter, avoiding the torchcodec fps=24 default issue.

Made-with: Cursor
---
 .../data_annotations/vlm_annotations.py | 32 ++++++-------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
index 010b26504..a7702a422 100644
--- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py
+++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
@@ -15,13 +15,14 @@
 # VLM Interface (Abstract Base Class for Modularity)
 
 import json
+import os
 import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 
-import cv2
 import torch
-from PIL import Image
+
+os.environ.setdefault("FORCE_QWENVL_VIDEO_READER", "torchvision")
 
 from lerobot.data_processing.data_annotations.subtask_annotations import Skill
 from lerobot.utils.constants import (
@@ -89,21 +90,6 @@ class BaseVLM(ABC):
         pass
 
 
-def _load_video_frames(video_path: Path) -> list[Image.Image]:
-    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults."""
-    frames: list[Image.Image] = []
-    cap = cv2.VideoCapture(str(video_path))
-    try:
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
-    finally:
-        cap.release()
-    return frames
-
-
 def create_skill_segmentation_prompt(
     coarse_goal: str | None = None,
     subtask_labels: list[str] | None = None,
@@ -166,7 +152,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -215,7 +201,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -339,7 +325,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -387,7 +373,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -503,7 +489,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -551,7 +537,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": _load_video_frames(video_path)},
+                    {"type": "video", "video": str(video_path), "fps": 1.0},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
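
Note: a minimal standalone sketch of the intended flow, for reviewers only (not part of the patch). It assumes qwen_vl_utils consumes these message dicts and picks its video reader from FORCE_QWENVL_VIDEO_READER at import time, which is why the patch sets the variable before the lerobot imports; the video path below is a hypothetical placeholder.

    import os

    # Assumption: qwen_vl_utils reads this variable when it is imported, so it
    # must be set before the import below (same ordering the patch enforces).
    os.environ.setdefault("FORCE_QWENVL_VIDEO_READER", "torchvision")

    from qwen_vl_utils import process_vision_info  # noqa: E402

    # Hypothetical local video path, for illustration only.
    video_path = "episode_000000.mp4"

    messages = [
        {
            "role": "user",
            "content": [
                # Path string plus an explicit fps: the torchvision (PyAV)
                # backend reads the real metadata and samples 1 frame per second.
                {"type": "video", "video": video_path, "fps": 1.0},
                {"type": "text", "text": "Segment into atomic skills."},
            ],
        }
    ]

    # image_inputs is None here; video_inputs holds the sampled frames.
    image_inputs, video_inputs = process_vision_info(messages)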