From 2545f1a8edbafc5fd457115e1d1d6d810774988a Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 30 Mar 2026 19:09:46 +0200 Subject: [PATCH] fix: route video_metadata through videos_kwargs for Qwen3/3.5 processors The Qwen3VLProcessor distributes kwargs to sub-processors via _merge_kwargs. Flat kwargs like video_metadata and do_sample_frames were not reaching the video processor, causing fps to default to 24 and producing shape mismatches. Pass these kwargs explicitly under videos_kwargs so they reach Qwen3VLVideoProcessor directly. Revert Qwen2VL to its simpler original approach since its processor doesn't use videos_kwargs. Made-with: Cursor --- .../data_annotations/vlm_annotations.py | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py index 25d8e67bf..c93e4758f 100644 --- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py +++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py @@ -170,14 +170,11 @@ class Qwen2VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True) - videos, video_metadata = _unpack_video_inputs(video_inputs) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, - videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos=video_inputs, padding=True, return_tensors="pt", ).to(self.device) @@ -222,20 +219,17 @@ class Qwen2VL(BaseVLM): all_messages.append(messages) all_texts = [] - all_video_tuples = [] + all_video_inputs = [] for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) - all_video_tuples.extend(video_inputs or []) + all_video_inputs.extend(video_inputs or []) - videos, video_metadata = _unpack_video_inputs(all_video_tuples or None) inputs = self.processor( text=all_texts, - videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos=all_video_inputs or None, padding=True, return_tensors="pt", ).to(self.device) @@ -350,8 +344,10 @@ class Qwen3VL(BaseVLM): text=[text], images=image_inputs, videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos_kwargs={ + "video_metadata": video_metadata, + "do_sample_frames": False, + }, padding=True, return_tensors="pt", ).to(self.device) @@ -407,8 +403,10 @@ class Qwen3VL(BaseVLM): inputs = self.processor( text=all_texts, videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos_kwargs={ + "video_metadata": video_metadata, + "do_sample_frames": False, + }, padding=True, return_tensors="pt", ).to(self.device) @@ -517,8 +515,10 @@ class Qwen35VL(BaseVLM): text=[text], images=image_inputs, videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos_kwargs={ + "video_metadata": video_metadata, + "do_sample_frames": False, + }, padding=True, return_tensors="pt", ).to(self.device) @@ -575,8 +575,10 @@ class Qwen35VL(BaseVLM): inputs = self.processor( text=all_texts, videos=videos, - video_metadata=video_metadata, - do_sample_frames=False, + videos_kwargs={ + "video_metadata": video_metadata, + "do_sample_frames": False, + }, padding=True, return_tensors="pt", ).to(self.device)