fix: pass fps=1.0 scalar to processor instead of video_metadata tuples

The return_video_metadata=True approach fails with 'list index out of
range' because of (tensor, metadata) tuple format issues. Since all
extracted videos are at 1 fps (ffmpeg -r 1), pass fps=1.0 directly as a
scalar alongside do_sample_frames=False. This gives the processor the
exact fps for position embedding computation and avoids format
compatibility issues across Qwen processor versions.

Made-with: Cursor
This commit is contained in:
Pepijn
2026-03-30 17:32:30 +02:00
parent 9a298524ca
commit 72692525da
@@ -159,14 +159,13 @@ class Qwen2VL(BaseVLM):
] ]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
inputs = self.processor( inputs = self.processor(
text=[text], text=[text],
images=image_inputs, images=image_inputs,
videos=video_inputs, videos=video_inputs,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)
@@ -216,9 +215,7 @@ class Qwen2VL(BaseVLM):
for messages in all_messages: for messages in all_messages:
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
all_texts.append(text) all_texts.append(text)
all_image_inputs.extend(image_inputs or []) all_image_inputs.extend(image_inputs or [])
all_video_inputs.extend(video_inputs or []) all_video_inputs.extend(video_inputs or [])
@@ -227,7 +224,8 @@ class Qwen2VL(BaseVLM):
text=all_texts, text=all_texts,
images=all_image_inputs if all_image_inputs else None, images=all_image_inputs if all_image_inputs else None,
videos=all_video_inputs if all_video_inputs else None, videos=all_video_inputs if all_video_inputs else None,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)
@@ -336,14 +334,13 @@ class Qwen3VL(BaseVLM):
] ]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
inputs = self.processor( inputs = self.processor(
text=[text], text=[text],
images=image_inputs, images=image_inputs,
videos=video_inputs, videos=video_inputs,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)
@@ -392,9 +389,7 @@ class Qwen3VL(BaseVLM):
for messages in all_messages: for messages in all_messages:
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
all_texts.append(text) all_texts.append(text)
all_image_inputs.extend(image_inputs or []) all_image_inputs.extend(image_inputs or [])
all_video_inputs.extend(video_inputs or []) all_video_inputs.extend(video_inputs or [])
@@ -403,7 +398,8 @@ class Qwen3VL(BaseVLM):
text=all_texts, text=all_texts,
images=all_image_inputs if all_image_inputs else None, images=all_image_inputs if all_image_inputs else None,
videos=all_video_inputs if all_video_inputs else None, videos=all_video_inputs if all_video_inputs else None,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)
@@ -506,14 +502,13 @@ class Qwen35VL(BaseVLM):
text = self.processor.apply_chat_template( text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
) )
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
inputs = self.processor( inputs = self.processor(
text=[text], text=[text],
images=image_inputs, images=image_inputs,
videos=video_inputs, videos=video_inputs,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)
@@ -563,9 +558,7 @@ class Qwen35VL(BaseVLM):
text = self.processor.apply_chat_template( text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
) )
image_inputs, video_inputs, video_kwargs = self.process_vision_info( image_inputs, video_inputs = self.process_vision_info(messages)
messages, return_video_kwargs=True, return_video_metadata=True
)
all_texts.append(text) all_texts.append(text)
all_image_inputs.extend(image_inputs or []) all_image_inputs.extend(image_inputs or [])
all_video_inputs.extend(video_inputs or []) all_video_inputs.extend(video_inputs or [])
@@ -574,7 +567,8 @@ class Qwen35VL(BaseVLM):
text=all_texts, text=all_texts,
images=all_image_inputs if all_image_inputs else None, images=all_image_inputs if all_image_inputs else None,
videos=all_video_inputs if all_video_inputs else None, videos=all_video_inputs if all_video_inputs else None,
**video_kwargs, do_sample_frames=False,
fps=1.0,
padding=True, padding=True,
return_tensors="pt", return_tensors="pt",
).to(self.device) ).to(self.device)