fix: bypass torchcodec video decoding by pre-reading frames via cv2

When torchcodec is installed, qwen-vl-utils ignores the fps parameter
and defaults to 24fps if video metadata is missing, causing shape
mismatches. Fix by reading video frames directly as PIL images and
passing them to the processor, bypassing torchcodec entirely.

Made-with: Cursor
This commit is contained in:
Pepijn
2026-03-30 16:03:26 +02:00
parent 9b211a45d6
commit ac41cd6672
@@ -19,7 +19,9 @@ import re
from abc import ABC, abstractmethod
from pathlib import Path
import cv2
import torch
from PIL import Image
from lerobot.data_processing.data_annotations.subtask_annotations import Skill
from lerobot.utils.constants import (
@@ -87,6 +89,21 @@ class BaseVLM(ABC):
pass
def _load_video_frames(video_path: Path) -> list[Image.Image]:
    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults.

    Frames are decoded sequentially with OpenCV, converted from BGR to RGB and
    wrapped as PIL images. Every decoded frame is kept in the returned list.

    Args:
        video_path: Path of the video file to decode.

    Returns:
        One PIL image per successfully read frame, in decoding order.

    Raises:
        OSError: If OpenCV cannot open ``video_path``.
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        # A capture that failed to open only reports "no frame" on read(),
        # which would otherwise surface as a silent empty list.
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        frames: list[Image.Image] = []
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decode failure): stop reading.
                break
            # Convert OpenCV's BGR channel order to RGB before wrapping in PIL.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        return frames
    finally:
        # Always free the underlying decoder handle, even on error.
        cap.release()
def create_skill_segmentation_prompt(
coarse_goal: str | None = None,
subtask_labels: list[str] | None = None,
@@ -149,7 +166,7 @@ class Qwen2VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -198,7 +215,7 @@ class Qwen2VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
@@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
@@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM):
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{"type": "video", "video": _load_video_frames(video_path)},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",