From ac41cd6672a98cc245287b3167668d872cf15c5a Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 30 Mar 2026 16:03:26 +0200
Subject: [PATCH] fix: bypass torchcodec video decoding by pre-reading frames
 via cv2

When torchcodec is installed, qwen-vl-utils ignores the fps parameter
and defaults to 24fps if video metadata is missing, causing shape
mismatches. Fix by decoding every frame of the video with OpenCV (cv2)
into PIL images and passing that list to the processor instead of the
video path, bypassing torchcodec entirely. The "fps": 1.0 key is
removed from the video entries accordingly.

Made-with: Cursor
---
 .../data_annotations/vlm_annotations.py       | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
index 237c8af51..010b26504 100644
--- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py
+++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
@@ -19,7 +19,9 @@ import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 
+import cv2
 import torch
+from PIL import Image
 
 from lerobot.data_processing.data_annotations.subtask_annotations import Skill
 from lerobot.utils.constants import (
@@ -87,6 +89,21 @@ class BaseVLM(ABC):
         pass
 
 
+def _load_video_frames(video_path: Path) -> list[Image.Image]:
+    """Decode every frame of a video into RGB PIL Images; raise ValueError if none decode."""
+    frames: list[Image.Image] = []
+    cap = cv2.VideoCapture(str(video_path))
+    try:
+        # read() returns (ok, frame); ok is False at end of stream or if the file never opened.
+        while (grabbed := cap.read())[0]:
+            frames.append(Image.fromarray(cv2.cvtColor(grabbed[1], cv2.COLOR_BGR2RGB)))
+    finally:
+        cap.release()
+    if not frames:
+        raise ValueError(f"No frames could be decoded from video: {video_path}")
+    return frames
+
+
 def create_skill_segmentation_prompt(
     coarse_goal: str | None = None,
     subtask_labels: list[str] | None = None,
@@ -149,7 +166,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -198,7 +215,7 @@ class Qwen2VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",