From 002a9dd0b9dae111bca35bd3e34956ea44d85320 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 30 Mar 2026 16:55:46 +0200 Subject: [PATCH] fix: use do_sample_frames=False instead of video_kwargs fps list The Qwen3.5 processor expects fps as a scalar, not a list, so passing video_kwargs with fps=[...] fails validation. Since process_vision_info already handles frame sampling, we only need do_sample_frames=False to tell the processor to use the pre-sampled frames as-is. Made-with: Cursor --- .../data_annotations/vlm_annotations.py | 42 ++++++------------- 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py index 64bb3fbec..3ba12c919 100644 --- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py +++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py @@ -159,14 +159,12 @@ class Qwen2VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device) @@ -213,23 +211,19 @@ class Qwen2VL(BaseVLM): all_texts = [] all_image_inputs = [] all_video_inputs = [] - all_video_kwargs: dict = {"do_sample_frames": False, "fps": []} for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) - all_video_kwargs["fps"].extend(video_kwargs.get("fps", [])) inputs = self.processor( text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **all_video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device) @@ -338,14 +332,12 @@ class Qwen3VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device) @@ -391,23 +383,19 @@ class Qwen3VL(BaseVLM): all_texts = [] all_image_inputs = [] all_video_inputs = [] - all_video_kwargs: dict = {"do_sample_frames": False, "fps": []} for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) - all_video_kwargs["fps"].extend(video_kwargs.get("fps", [])) inputs = self.processor( text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **all_video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device) @@ -510,14 +498,12 @@ class Qwen35VL(BaseVLM): text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device) @@ -562,25 +548,21 @@ class Qwen35VL(BaseVLM): all_texts = [] all_image_inputs = [] all_video_inputs = [] - all_video_kwargs: dict = {"do_sample_frames": False, "fps": []} for messages in all_messages: text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) - all_video_kwargs["fps"].extend(video_kwargs.get("fps", [])) inputs = self.processor( text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **all_video_kwargs, + do_sample_frames=False, padding=True, return_tensors="pt", ).to(self.device)