From 9a298524caf2a893e40fe0e93f27addb5151841d Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 30 Mar 2026 17:23:44 +0200 Subject: [PATCH] fix: pass video_metadata via process_vision_info for correct position embeddings The Qwen3.5 processor needs video_metadata (fps, frame indices) to compute temporal position embeddings. Use return_video_metadata=True which embeds metadata inside the video tensors as (tensor, metadata) tuples, and return_video_kwargs=True which returns {'do_sample_frames': False} without the problematic fps list. Made-with: Cursor --- .../data_annotations/vlm_annotations.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py index 3ba12c919..7f4c47781 100644 --- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py +++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py @@ -159,12 +159,14 @@ class Qwen2VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device) @@ -214,7 +216,9 @@ class Qwen2VL(BaseVLM): for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -223,7 +227,7 @@ class Qwen2VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device) @@ -332,12 +336,14 @@ class Qwen3VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device) @@ -386,7 +392,9 @@ class Qwen3VL(BaseVLM): for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -395,7 +403,7 @@ class Qwen3VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device) @@ -498,12 +506,14 @@ class Qwen35VL(BaseVLM): text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device) @@ -553,7 +563,9 @@ class Qwen35VL(BaseVLM): text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs = self.process_vision_info(messages) + image_inputs, video_inputs, video_kwargs = self.process_vision_info( + messages, return_video_kwargs=True, return_video_metadata=True + ) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -562,7 +574,7 @@ class Qwen35VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - do_sample_frames=False, + **video_kwargs, padding=True, return_tensors="pt", ).to(self.device)