fix: bypass torchcodec video decoding by pre-reading frames via cv2

When torchcodec is installed, qwen-vl-utils ignores the `fps` parameter and falls back to 24 fps if the video metadata is missing, which causes shape mismatches. Fix this by reading the video frames directly with cv2 as PIL images and passing them to the processor, bypassing torchcodec entirely.

Made-with: Cursor
2026-07-12 20:41:58 +00:00 · 2026-03-30 16:03:26 +02:00
parent 9b211a45d6
commit ac41cd6672
1 changed files with 23 additions and 6 deletions
@@ -19,7 +19,9 @@ import re
 from abc import ABC, abstractmethod
 from pathlib import Path

+import cv2
 import torch
+from PIL import Image

 from lerobot.data_processing.data_annotations.subtask_annotations import Skill
 from lerobot.utils.constants import (
@@ -87,6 +89,21 @@ class BaseVLM(ABC):
        pass


+def _load_video_frames(video_path: Path) -> list[Image.Image]:
+    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults."""
+    frames: list[Image.Image] = []
+    cap = cv2.VideoCapture(str(video_path))
+    try:
+        while True:
+            ret, frame = cap.read()
+            if not ret:  # read() reported no frame: stop collecting
+                break
+            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))  # BGR -> RGB before building the PIL image
+    finally:
+        cap.release()  # release the capture even if reading or conversion raises
+    return frames  # stays empty when no frame could be read
+
+
 def create_skill_segmentation_prompt(
    coarse_goal: str | None = None,
    subtask_labels: list[str] | None = None,
@@ -149,7 +166,7 @@ class Qwen2VL(BaseVLM):
            {
                "role": "user",
                "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                    {
                        "type": "text",
                        "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -198,7 +215,7 @@ class Qwen2VL(BaseVLM):
                {
                    "role": "user",
                    "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                        {
                            "type": "text",
                            "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM):
            {
                "role": "user",
                "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                    {
                        "type": "text",
                        "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM):
                {
                    "role": "user",
                    "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                        {
                            "type": "text",
                            "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM):
            {
                "role": "user",
                "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                    {
                        "type": "text",
                        "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM):
                {
                    "role": "user",
                    "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                        {
                            "type": "text",
                            "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",