fix: pass video_metadata via process_vision_info for correct position embeddings

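The Qwen3.5 processor needs video_metadata (fps, frame indices) to compute temporal position embeddings. Call process_vision_info with return_video_metadata=True, which embeds the metadata inside the video inputs as (tensor, metadata) tuples, and with return_video_kwargs=True, which returns {'do_sample_frames': False} without the problematic fps list. The returned video_kwargs are forwarded to the processor in place of the hard-coded do_sample_frames=False. The same change is applied to the single-sample and batched paths of Qwen2VL, Qwen3VL and Qwen35VL.

Made-with: Cursor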
2026-07-24 10:16:09 +00:00 · 2026-03-30 17:23:44 +02:00
parent 002a9dd0b9
commit 9a298524ca
1 changed file with 24 additions and 12 deletions
@@ -159,12 +159,14 @@ class Qwen2VL(BaseVLM):
        ]
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = self.process_vision_info(messages)
+        image_inputs, video_inputs, video_kwargs = self.process_vision_info(
            messages, return_video_kwargs=True, return_video_metadata=True
        )
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
@@ -214,7 +216,9 @@ class Qwen2VL(BaseVLM):
        for messages in all_messages:
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            image_inputs, video_inputs = self.process_vision_info(messages)
+            image_inputs, video_inputs, video_kwargs = self.process_vision_info(
                messages, return_video_kwargs=True, return_video_metadata=True
            )
            all_texts.append(text)
            all_image_inputs.extend(image_inputs or [])
            all_video_inputs.extend(video_inputs or [])
@@ -223,7 +227,7 @@ class Qwen2VL(BaseVLM):
            text=all_texts,
            images=all_image_inputs if all_image_inputs else None,
            videos=all_video_inputs if all_video_inputs else None,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
@@ -332,12 +336,14 @@ class Qwen3VL(BaseVLM):
        ]
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = self.process_vision_info(messages)
+        image_inputs, video_inputs, video_kwargs = self.process_vision_info(
            messages, return_video_kwargs=True, return_video_metadata=True
        )
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
@@ -386,7 +392,9 @@ class Qwen3VL(BaseVLM):
        for messages in all_messages:
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            image_inputs, video_inputs = self.process_vision_info(messages)
+            image_inputs, video_inputs, video_kwargs = self.process_vision_info(
                messages, return_video_kwargs=True, return_video_metadata=True
            )
            all_texts.append(text)
            all_image_inputs.extend(image_inputs or [])
            all_video_inputs.extend(video_inputs or [])
@@ -395,7 +403,7 @@ class Qwen3VL(BaseVLM):
            text=all_texts,
            images=all_image_inputs if all_image_inputs else None,
            videos=all_video_inputs if all_video_inputs else None,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
@@ -498,12 +506,14 @@ class Qwen35VL(BaseVLM):
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
-        image_inputs, video_inputs = self.process_vision_info(messages)
+        image_inputs, video_inputs, video_kwargs = self.process_vision_info(
            messages, return_video_kwargs=True, return_video_metadata=True
        )
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
@@ -553,7 +563,9 @@ class Qwen35VL(BaseVLM):
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
            )
-            image_inputs, video_inputs = self.process_vision_info(messages)
+            image_inputs, video_inputs, video_kwargs = self.process_vision_info(
                messages, return_video_kwargs=True, return_video_metadata=True
            )
            all_texts.append(text)
            all_image_inputs.extend(image_inputs or [])
            all_video_inputs.extend(video_inputs or [])
@@ -562,7 +574,7 @@ class Qwen35VL(BaseVLM):
            text=all_texts,
            images=all_image_inputs if all_image_inputs else None,
            videos=all_video_inputs if all_video_inputs else None,
-            do_sample_frames=False,
+            **video_kwargs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)