From 72692525dac1567d6d38772fa4a664e500301b15 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 30 Mar 2026 17:32:30 +0200 Subject: [PATCH] fix: pass fps=1.0 scalar to processor instead of video_metadata tuples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return_video_metadata=True approach causes 'list index out of range' due to (tensor, metadata) tuple format issues. Since all extracted videos are at 1fps (ffmpeg -r 1), directly pass fps=1.0 as a scalar alongside do_sample_frames=False — this gives the processor the exact fps for position embedding computation without format compatibility issues across Qwen processor versions. Made-with: Cursor --- .../data_annotations/vlm_annotations.py | 42 ++++++++----------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/src/lerobot/data_processing/data_annotations/vlm_annotations.py b/src/lerobot/data_processing/data_annotations/vlm_annotations.py index 7f4c47781..4dd019ab7 100644 --- a/src/lerobot/data_processing/data_annotations/vlm_annotations.py +++ b/src/lerobot/data_processing/data_annotations/vlm_annotations.py @@ -159,14 +159,13 @@ class Qwen2VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device) @@ -216,9 +215,7 @@ class Qwen2VL(BaseVLM): for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -227,7 +224,8 @@ class Qwen2VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device) @@ -336,14 +334,13 @@ class Qwen3VL(BaseVLM): ] text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device) @@ -392,9 +389,7 @@ class Qwen3VL(BaseVLM): for messages in all_messages: text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -403,7 +398,8 @@ class Qwen3VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device) @@ -506,14 +502,13 @@ class Qwen35VL(BaseVLM): text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) inputs = self.processor( text=[text], images=image_inputs, videos=video_inputs, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device) @@ -563,9 +558,7 @@ class Qwen35VL(BaseVLM): text = self.processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False ) - image_inputs, video_inputs, video_kwargs = self.process_vision_info( - messages, return_video_kwargs=True, return_video_metadata=True - ) + image_inputs, video_inputs = self.process_vision_info(messages) all_texts.append(text) all_image_inputs.extend(image_inputs or []) all_video_inputs.extend(video_inputs or []) @@ -574,7 +567,8 @@ class Qwen35VL(BaseVLM): text=all_texts, images=all_image_inputs if all_image_inputs else None, videos=all_video_inputs if all_video_inputs else None, - **video_kwargs, + do_sample_frames=False, + fps=1.0, padding=True, return_tensors="pt", ).to(self.device)