|
|
@@ -681,7 +681,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
|
|
|
|
Explanation:
|
|
|
|
Explanation:
|
|
|
|
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
|
|
|
|
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
|
|
|
|
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
|
|
|
|
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
|
|
|
|
vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately.
|
|
|
|
vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
|
|
|
|
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
|
|
|
|
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
|
|
|
|
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
|
|
|
|
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
|
|
|
|
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
|
|
|
|
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
|
|
|
@@ -779,7 +779,8 @@ class Qwen2_5_VLAttention(nn.Module):
|
|
|
|
output_attentions: bool = False,
|
|
|
|
output_attentions: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor]
|
|
|
|
|
|
|
|
| None = None, # necessary, but kept here for BC
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
|
|
|
|
bsz, q_len, _ = hidden_states.size()
|
|
|
|
bsz, q_len, _ = hidden_states.size()
|
|
|
|
|
|
|
|
|
|
|
@@ -858,7 +859,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
|
|
|
|
super().__init__(*args, **kwargs)
|
|
|
|
super().__init__(*args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
|
|
|
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
|
|
|
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
|
|
|
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
|
|
|
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
|
|
|
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
|
|
|
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
|
|
|
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
|
|
|
|
|
|
|
|
|
|
@@ -871,7 +872,8 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
|
|
|
|
output_attentions: bool = False,
|
|
|
|
output_attentions: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor]
|
|
|
|
|
|
|
|
| None = None, # necessary, but kept here for BC
|
|
|
|
):
|
|
|
|
):
|
|
|
|
bsz, q_len, _ = hidden_states.size()
|
|
|
|
bsz, q_len, _ = hidden_states.size()
|
|
|
|
query_states = self.q_proj(hidden_states)
|
|
|
|
query_states = self.q_proj(hidden_states)
|
|
|
@@ -965,7 +967,8 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
|
|
|
|
output_attentions: bool = False,
|
|
|
|
output_attentions: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
use_cache: bool = False,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor]
|
|
|
|
|
|
|
|
| None = None, # necessary, but kept here for BC
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
|
|
|
|
if output_attentions:
|
|
|
|
if output_attentions:
|
|
|
|
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
|
|
|
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
|
|
@@ -1077,7 +1080,8 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
|
|
|
|
output_attentions: bool | None = False,
|
|
|
|
output_attentions: bool | None = False,
|
|
|
|
use_cache: bool | None = False,
|
|
|
|
use_cache: bool | None = False,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
cache_position: torch.LongTensor | None = None,
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
|
|
|
|
position_embeddings: tuple[torch.Tensor, torch.Tensor]
|
|
|
|
|
|
|
|
| None = None, # necessary, but kept here for BC
|
|
|
|
**kwargs,
|
|
|
|
**kwargs,
|
|
|
|
) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
|
|
|
|
) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
|
|
|
|
"""
|
|
|
|
"""
|
|
|
@@ -1618,7 +1622,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
|
|
|
|
width position_ids: [0, 1, 2, 3, 4]
|
|
|
|
width position_ids: [0, 1, 2, 3, 4]
|
|
|
|
|
|
|
|
|
|
|
|
For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
|
|
For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
|
|
and 1D rotary position embeddin for text part.
|
|
|
|
and 1D rotary position embedding for text part.
|
|
|
|
Examples:
|
|
|
|
Examples:
|
|
|
|
Temporal (Time): 3 patches, representing different segments of the video in time.
|
|
|
|
Temporal (Time): 3 patches, representing different segments of the video in time.
|
|
|
|
Height: 2 patches, dividing each frame vertically.
|
|
|
|
Height: 2 patches, dividing each frame vertically.
|
|
|
@@ -2728,7 +2732,7 @@ class Qwen2_5_VLMoEModel(Qwen2_5_VLPreTrainedModel):
|
|
|
|
dtype (`torch.dtype`):
|
|
|
|
dtype (`torch.dtype`):
|
|
|
|
The dtype to use for the 4D attention mask.
|
|
|
|
The dtype to use for the 4D attention mask.
|
|
|
|
device (`torch.device`):
|
|
|
|
device (`torch.device`):
|
|
|
|
The device to plcae the 4D attention mask on.
|
|
|
|
The device to place the 4D attention mask on.
|
|
|
|
cache_position (`torch.Tensor`):
|
|
|
|
cache_position (`torch.Tensor`):
|
|
|
|
Indices depicting the position of the input sequence tokens in the sequence.
|
|
|
|
Indices depicting the position of the input sequence tokens in the sequence.
|
|
|
|
batch_size (`torch.Tensor`):
|
|
|
|
batch_size (`torch.Tensor`):
|
|
|
|