mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 05:59:52 +00:00
fix: unpack video_metadata from tuples and pass separately to processor
The Qwen3.5 processor requires video_metadata to be passed as a separate parameter rather than bundled together with the video tensors. Call process_vision_info with return_video_metadata=True, then unpack the returned (tensor, metadata) tuples into separate videos and video_metadata lists for the processor call.
Made-with: Cursor
This commit is contained in:
@@ -87,6 +87,17 @@ class BaseVLM(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_video_inputs(
|
||||||
|
video_inputs: list | None,
|
||||||
|
) -> tuple[list | None, list[dict] | None]:
|
||||||
|
"""Unpack (tensor, metadata) tuples returned by process_vision_info with return_video_metadata=True."""
|
||||||
|
if not video_inputs:
|
||||||
|
return None, None
|
||||||
|
videos = [v[0] for v in video_inputs]
|
||||||
|
metadata = [v[1] for v in video_inputs]
|
||||||
|
return videos, metadata
|
||||||
|
|
||||||
|
|
||||||
def create_skill_segmentation_prompt(
|
def create_skill_segmentation_prompt(
|
||||||
coarse_goal: str | None = None,
|
coarse_goal: str | None = None,
|
||||||
subtask_labels: list[str] | None = None,
|
subtask_labels: list[str] | None = None,
|
||||||
@@ -159,13 +170,14 @@ class Qwen2VL(BaseVLM):
|
|||||||
]
|
]
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
|
videos, video_metadata = _unpack_video_inputs(video_inputs)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=videos,
|
||||||
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -210,22 +222,20 @@ class Qwen2VL(BaseVLM):
|
|||||||
all_messages.append(messages)
|
all_messages.append(messages)
|
||||||
|
|
||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_video_tuples = []
|
||||||
all_video_inputs = []
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_video_tuples.extend(video_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
|
||||||
|
|
||||||
|
videos, video_metadata = _unpack_video_inputs(all_video_tuples or None)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
videos=videos,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -334,13 +344,14 @@ class Qwen3VL(BaseVLM):
|
|||||||
]
|
]
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
|
videos, video_metadata = _unpack_video_inputs(video_inputs)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=videos,
|
||||||
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -384,22 +395,20 @@ class Qwen3VL(BaseVLM):
|
|||||||
all_messages.append(messages)
|
all_messages.append(messages)
|
||||||
|
|
||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_video_tuples = []
|
||||||
all_video_inputs = []
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_video_tuples.extend(video_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
|
||||||
|
|
||||||
|
videos, video_metadata = _unpack_video_inputs(all_video_tuples or None)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
videos=videos,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -502,13 +511,14 @@ class Qwen35VL(BaseVLM):
|
|||||||
text = self.processor.apply_chat_template(
|
text = self.processor.apply_chat_template(
|
||||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||||
)
|
)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
|
videos, video_metadata = _unpack_video_inputs(video_inputs)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=[text],
|
text=[text],
|
||||||
images=image_inputs,
|
images=image_inputs,
|
||||||
videos=video_inputs,
|
videos=videos,
|
||||||
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
@@ -551,24 +561,22 @@ class Qwen35VL(BaseVLM):
|
|||||||
all_messages.append(messages)
|
all_messages.append(messages)
|
||||||
|
|
||||||
all_texts = []
|
all_texts = []
|
||||||
all_image_inputs = []
|
all_video_tuples = []
|
||||||
all_video_inputs = []
|
|
||||||
|
|
||||||
for messages in all_messages:
|
for messages in all_messages:
|
||||||
text = self.processor.apply_chat_template(
|
text = self.processor.apply_chat_template(
|
||||||
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
||||||
)
|
)
|
||||||
image_inputs, video_inputs = self.process_vision_info(messages)
|
image_inputs, video_inputs = self.process_vision_info(messages, return_video_metadata=True)
|
||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
all_image_inputs.extend(image_inputs or [])
|
all_video_tuples.extend(video_inputs or [])
|
||||||
all_video_inputs.extend(video_inputs or [])
|
|
||||||
|
|
||||||
|
videos, video_metadata = _unpack_video_inputs(all_video_tuples or None)
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=all_texts,
|
text=all_texts,
|
||||||
images=all_image_inputs if all_image_inputs else None,
|
videos=videos,
|
||||||
videos=all_video_inputs if all_video_inputs else None,
|
video_metadata=video_metadata,
|
||||||
do_sample_frames=False,
|
do_sample_frames=False,
|
||||||
fps=1.0,
|
|
||||||
padding=True,
|
padding=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
).to(self.device)
|
).to(self.device)
|
||||||
|
|||||||
Reference in New Issue
Block a user