def _load_video_frames(video_path: Path) -> list[Image.Image]:
    """Read all frames from a video as PIL Images, bypassing torchcodec's fps defaults.

    Every frame is decoded with OpenCV, converted from BGR to RGB, and kept in
    memory, so the size of the returned list grows with the length of the video.

    Args:
        video_path: Path of the video file to decode.

    Returns:
        The decoded frames, in file order. The list is empty when the video
        opens successfully but yields no frames.

    Raises:
        OSError: If OpenCV cannot open ``video_path`` (missing file, unreadable
            file, or unsupported container/codec).
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        # VideoCapture does not raise on failure; without this check an
        # unreadable file would silently produce an empty frame list that is
        # then handed to the model as the "video".
        if not cap.isOpened():
            raise OSError(f"Could not open video file: {video_path}")

        frames: list[Image.Image] = []
        while True:
            ret, frame = cap.read()
            if not ret:
                # read() reports failure once no further frame can be grabbed.
                break
            # Frames are converted BGR -> RGB before being wrapped as PIL images.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        # Always free the underlying decoder handle, even if decoding fails.
        cap.release()
    return frames
Last skill must end at {duration:.1f}.", @@ -322,7 +339,7 @@ class Qwen3VL(BaseVLM): { "role": "user", "content": [ - {"type": "video", "video": str(video_path), "fps": 1.0}, + {"type": "video", "video": _load_video_frames(video_path)}, { "type": "text", "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.", @@ -370,7 +387,7 @@ class Qwen3VL(BaseVLM): { "role": "user", "content": [ - {"type": "video", "video": str(video_path), "fps": 1.0}, + {"type": "video", "video": _load_video_frames(video_path)}, { "type": "text", "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.", @@ -486,7 +503,7 @@ class Qwen35VL(BaseVLM): { "role": "user", "content": [ - {"type": "video", "video": str(video_path), "fps": 1.0}, + {"type": "video", "video": _load_video_frames(video_path)}, { "type": "text", "text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.", @@ -534,7 +551,7 @@ class Qwen35VL(BaseVLM): { "role": "user", "content": [ - {"type": "video", "video": str(video_path), "fps": 1.0}, + {"type": "video", "video": _load_video_frames(video_path)}, { "type": "text", "text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",