From ac41cd6672a98cc245287b3167668d872cf15c5a Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 30 Mar 2026 16:03:26 +0200
Subject: [PATCH] fix: bypass torchcodec video decoding by pre-reading frames via cv2

When torchcodec is installed, qwen-vl-utils ignores the fps parameter
and defaults to 24fps if video metadata is missing, causing shape
mismatches. Fix by reading video frames directly as PIL images and
passing them to the processor, bypassing torchcodec entirely.

Made-with: Cursor
---
 .../data_annotations/vlm_annotations.py | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
index 237c8af51..010b26504 100644
--- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py
+++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py
@@ -19,7 +19,9 @@ import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 
+import cv2
 import torch
+from PIL import Image
 
 from lerobot.data_processing.data_annotations.subtask_annotations import Skill
 from lerobot.utils.constants import (
@@ -87,6 +89,21 @@ class BaseVLM(ABC):
         pass
 
 
+def _load_video_frames(video_path: Path) -> list[Image.Image]:
+    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults."""
+    frames: list[Image.Image] = []
+    cap = cv2.VideoCapture(str(video_path))
+    try:
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
+    finally:
+        cap.release()
+    return frames
+
+
 def create_skill_segmentation_prompt(
     coarse_goal: str | None = None,
     subtask_labels: list[str] | None = None,
@@ -149,7 +166,7 @@ class Qwen2VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -198,7 +215,7 @@ class Qwen2VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM):
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": str(video_path), "fps": 1.0},
+                    {"type": "video", "video": _load_video_frames(video_path)},
                     {
                         "type": "text",
                         "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "video": str(video_path), "fps": 1.0},
+                        {"type": "video", "video": _load_video_frames(video_path)},
                         {
                             "type": "text",
                             "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",