mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-19 02:29:47 +00:00
fix: pass video_metadata via process_vision_info for correct position embeddings
The Qwen3.5 processor needs video_metadata (fps, frame indices) to
compute temporal position embeddings. Call process_vision_info with
return_video_metadata=True, which returns each video as a
(tensor, metadata) tuple, and with return_video_kwargs=True, which
returns {'do_sample_frames': False} without the problematic fps list.
The returned video_kwargs are forwarded to the processor in place of
the hard-coded do_sample_frames=False argument.
Made-with: Cursor
This commit is contained in:
@@ -159,12 +159,14 @@ class Qwen2VL(BaseVLM):
|
||||
]
|
||||
|
||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
inputs = self.processor(
|
||||
text=[text],
|
||||
images=image_inputs,
|
||||
videos=video_inputs,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
@@ -214,7 +216,9 @@ class Qwen2VL(BaseVLM):
|
||||
|
||||
for messages in all_messages:
|
||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
all_texts.append(text)
|
||||
all_image_inputs.extend(image_inputs or [])
|
||||
all_video_inputs.extend(video_inputs or [])
|
||||
@@ -223,7 +227,7 @@ class Qwen2VL(BaseVLM):
|
||||
text=all_texts,
|
||||
images=all_image_inputs if all_image_inputs else None,
|
||||
videos=all_video_inputs if all_video_inputs else None,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
@@ -332,12 +336,14 @@ class Qwen3VL(BaseVLM):
|
||||
]
|
||||
|
||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
inputs = self.processor(
|
||||
text=[text],
|
||||
images=image_inputs,
|
||||
videos=video_inputs,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
@@ -386,7 +392,9 @@ class Qwen3VL(BaseVLM):
|
||||
|
||||
for messages in all_messages:
|
||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
all_texts.append(text)
|
||||
all_image_inputs.extend(image_inputs or [])
|
||||
all_video_inputs.extend(video_inputs or [])
|
||||
@@ -395,7 +403,7 @@ class Qwen3VL(BaseVLM):
|
||||
text=all_texts,
|
||||
images=all_image_inputs if all_image_inputs else None,
|
||||
videos=all_video_inputs if all_video_inputs else None,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
@@ -498,12 +506,14 @@ class Qwen35VL(BaseVLM):
|
||||
text = self.processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||
)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
inputs = self.processor(
|
||||
text=[text],
|
||||
images=image_inputs,
|
||||
videos=video_inputs,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
@@ -553,7 +563,9 @@ class Qwen35VL(BaseVLM):
|
||||
text = self.processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||
)
|
||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
||||
messages, return_video_kwargs=True, return_video_metadata=True
|
||||
)
|
||||
all_texts.append(text)
|
||||
all_image_inputs.extend(image_inputs or [])
|
||||
all_video_inputs.extend(video_inputs or [])
|
||||
@@ -562,7 +574,7 @@ class Qwen35VL(BaseVLM):
|
||||
text=all_texts,
|
||||
images=all_image_inputs if all_image_inputs else None,
|
||||
videos=all_video_inputs if all_video_inputs else None,
|
||||
do_sample_frames=False,
|
||||
**video_kwargs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
).to(self.device)
|
||||
|
||||
Reference in New Issue
Block a user