mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-18 10:10:08 +00:00
fix: bypass torchcodec video decoding by pre-reading frames via cv2
When torchcodec is installed, qwen-vl-utils ignores the fps parameter and defaults to 24fps if video metadata is missing, causing shape mismatches. Fix by reading video frames directly as PIL images and passing them to the processor, bypassing torchcodec entirely. Made-with: Cursor
This commit is contained in:
@@ -19,7 +19,9 @@ import re
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from lerobot.data_processing.data_annotations.subtask_annotations import Skill
|
||||
from lerobot.utils.constants import (
|
||||
@@ -87,6 +89,21 @@ class BaseVLM(ABC):
|
||||
pass
|
||||
|
||||
|
||||
def _load_video_frames(video_path: Path) -> list[Image.Image]:
    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults.

    Frames are decoded sequentially with OpenCV, converted from BGR to RGB
    channel order, and wrapped as PIL images. Every decoded frame is kept in
    the returned list, so memory use grows with the length of the video.

    Args:
        video_path: Path to the video file to decode.

    Returns:
        The decoded frames in playback order.

    Raises:
        OSError: If OpenCV cannot open the video (missing file, unsupported
            codec, or unreadable container).
    """
    frames: list[Image.Image] = []
    cap = cv2.VideoCapture(str(video_path))
    try:
        # VideoCapture does not raise on failure; without this check an
        # unreadable video would silently yield an empty frame list and fail
        # later, far from the real cause.
        if not cap.isOpened():
            raise OSError(f"Could not open video for frame extraction: {video_path}")

        while True:
            ok, bgr_frame = cap.read()
            if not ok:
                # read() reports False once no further frame can be grabbed.
                break
            # OpenCV delivers BGR arrays; PIL expects RGB channel order.
            frames.append(Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)))
    finally:
        # Always free the decoder handle, including on the error path above.
        cap.release()
    return frames
|
||||
|
||||
|
||||
def create_skill_segmentation_prompt(
|
||||
coarse_goal: str | None = None,
|
||||
subtask_labels: list[str] | None = None,
|
||||
@@ -149,7 +166,7 @@ class Qwen2VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
|
||||
@@ -198,7 +215,7 @@ class Qwen2VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
|
||||
@@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
|
||||
@@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
|
||||
@@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
|
||||
@@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": str(video_path), "fps": 1.0},
|
||||
{"type": "video", "video": _load_video_frames(video_path)},
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
|
||||
|
||||
Reference in New Issue
Block a user