mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 19:49:49 +00:00
fix: use do_sample_frames=False instead of video_kwargs fps list
The Qwen3.5 processor expects `fps` as a scalar, not a list, so passing `video_kwargs` with `fps=[...]` fails validation. Since `process_vision_info` already handles frame sampling, we only need `do_sample_frames=False` to tell the processor to use the pre-sampled frames as-is.

Made-with: Cursor
This commit is contained in:
@@ -159,14 +159,12 @@ class Qwen2VL(BaseVLM):
|
|||||||
]
|
]
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=video_inputs,
|
||||||
**video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -213,23 +211,19 @@ class Qwen2VL(BaseVLM):
|
|||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_image_inputs = []
|
||||||
all_video_inputs = []
|
all_video_inputs = []
|
||||||
all_video_kwargs: dict = {"do_sample_frames": False, "fps": []}
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_image_inputs.extend(image_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
all_video_inputs.extend(video_inputs or [])
|
||||||
all_video_kwargs["fps"].extend(video_kwargs.get("fps", []))
|
|
||||||
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
images=all_image_inputs if all_image_inputs else None,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
videos=all_video_inputs if all_video_inputs else None,
|
||||||
**all_video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -338,14 +332,12 @@ class Qwen3VL(BaseVLM):
|
|||||||
]
|
]
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=video_inputs,
|
||||||
**video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -391,23 +383,19 @@ class Qwen3VL(BaseVLM):
|
|||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_image_inputs = []
|
||||||
all_video_inputs = []
|
all_video_inputs = []
|
||||||
all_video_kwargs: dict = {"do_sample_frames": False, "fps": []}
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_image_inputs.extend(image_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
all_video_inputs.extend(video_inputs or [])
|
||||||
all_video_kwargs["fps"].extend(video_kwargs.get("fps", []))
|
|
||||||
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
images=all_image_inputs if all_image_inputs else None,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
videos=all_video_inputs if all_video_inputs else None,
|
||||||
**all_video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -510,14 +498,12 @@ class Qwen35VL(BaseVLM):
|
|||||||
text = self.processor.apply_chat_template(
|
text = self.processor.apply_chat_template(
|
||||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||||
)
|
)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=video_inputs,
|
||||||
**video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -562,25 +548,21 @@ class Qwen35VL(BaseVLM):
|
|||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_image_inputs = []
|
||||||
all_video_inputs = []
|
all_video_inputs = []
|
||||||
all_video_kwargs: dict = {"do_sample_frames": False, "fps": []}
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(
|
text = self.processor.apply_chat_template(
|
||||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||||
)
|
)
|
||||||
image_inputs, video_inputs, video_kwargs = self.process_vision_info(
|
image_inputs, video_inputs = self.process_vision_info(messages)
|
||||||
messages, return_video_kwargs=True
|
|
||||||
)
|
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_image_inputs.extend(image_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
all_video_inputs.extend(video_inputs or [])
|
||||||
all_video_kwargs["fps"].extend(video_kwargs.get("fps", []))
|
|
||||||
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
images=all_image_inputs if all_image_inputs else None,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
videos=all_video_inputs if all_video_inputs else None,
|
||||||
**all_video_kwargs,
|
do_sample_frames=False,
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
|
|||||||
Reference in New Issue
Block a user