mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-24 04:59:47 +00:00
fix: route video_metadata through videos_kwargs for Qwen3/3.5 processors
The `Qwen3VLProcessor` distributes kwargs to its sub-processors via `_merge_kwargs`. Flat kwargs such as `video_metadata` and `do_sample_frames` were not reaching the video processor, causing fps to default to 24 and producing shape mismatches. Pass these kwargs explicitly under `videos_kwargs` so they reach `Qwen3VLVideoProcessor` directly. Revert `Qwen2VL` to its simpler original approach, since its processor doesn't use `videos_kwargs`.

Made-with: Cursor
This commit is contained in:
@@ -170,14 +170,11 @@ class Qwen2VL(BaseVLM):
|
|||||||
]
|
]
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
videos, video_metadata = _unpack_video_inputs(video_inputs)
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=videos,
|
videos=video_inputs,
|
||||||
video_metadata=video_metadata,
|
|
||||||
do_sample_frames=False,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -222,20 +219,17 @@ class Qwen2VL(BaseVLM):
|
|||||||
all_messages.append(messages)
|
all_messages.append(messages)
|
||||||
|
|
||||||
all_texts = []
|
all_texts = []
|
||||||
all_video_tuples = []
|
all_video_inputs = []
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_video_tuples.extend(video_inputs or [])
|
all_video_inputs.extend(video_inputs or [])
|
||||||
|
|
||||||
videos, video_metadata = _unpack_video_inputs(all_video_tuples or None)
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
videos=videos,
|
videos=all_video_inputs or None,
|
||||||
video_metadata=video_metadata,
|
|
||||||
do_sample_frames=False,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -350,8 +344,10 @@ class Qwen3VL(BaseVLM):
|
|||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=videos,
|
videos=videos,
|
||||||
video_metadata=video_metadata,
|
videos_kwargs={
|
||||||
do_sample_frames=False,
|
"video_metadata": video_metadata,
|
||||||
|
"do_sample_frames": False,
|
||||||
|
},
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -407,8 +403,10 @@ class Qwen3VL(BaseVLM):
|
|||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
videos=videos,
|
videos=videos,
|
||||||
video_metadata=video_metadata,
|
videos_kwargs={
|
||||||
do_sample_frames=False,
|
"video_metadata": video_metadata,
|
||||||
|
"do_sample_frames": False,
|
||||||
|
},
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -517,8 +515,10 @@ class Qwen35VL(BaseVLM):
|
|||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=videos,
|
videos=videos,
|
||||||
video_metadata=video_metadata,
|
videos_kwargs={
|
||||||
do_sample_frames=False,
|
"video_metadata": video_metadata,
|
||||||
|
"do_sample_frames": False,
|
||||||
|
},
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -575,8 +575,10 @@ class Qwen35VL(BaseVLM):
|
|||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
videos=videos,
|
videos=videos,
|
||||||
video_metadata=video_metadata,
|
videos_kwargs={
|
||||||
do_sample_frames=False,
|
"video_metadata": video_metadata,
|
||||||
|
"do_sample_frames": False,
|
||||||
|
},
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
|
|||||||
Reference in New Issue
Block a user