fix precommit issues

This commit is contained in:
Pepijn
2025-12-17 15:16:03 +01:00
committed by Michel Aractingi
parent 9ae4477356
commit c514d9ffe2
5 changed files with 18 additions and 12 deletions
+1
View File
@@ -242,6 +242,7 @@ ignore = [
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401", "F403"]
"src/lerobot/policies/wall_x/**" = ["N801", "N812", "SIM102", "SIM108", "SIM210", "SIM211", "B006", "B007", "SIM118"] # Suppress these as they come from the original Qwen2_5_vl code TODO(pepijn): refactor original
[tool.ruff.lint.isort]
combine-as-imports = true
+1 -1
View File
@@ -21,8 +21,8 @@ from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
from .smolvla.processor_smolvla import SmolVLANewLineProcessor
from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
from .wall_x.configuration_wall_x import WallXConfig as WallXConfig
from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
__all__ = [
"ACTConfig",
+1 -1
View File
@@ -41,8 +41,8 @@ from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
from lerobot.policies.utils import validate_visual_features_consistency
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
from lerobot.policies.wall_x.configuration_wall_x import WallXConfig
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
from lerobot.processor.converters import (
batch_to_transition,
@@ -681,7 +681,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
Explanation:
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately.
vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
@@ -779,7 +779,8 @@ class Qwen2_5_VLAttention(nn.Module):
output_attentions: bool = False,
use_cache: bool = False,
cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
bsz, q_len, _ = hidden_states.size()
@@ -858,7 +859,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
@@ -871,7 +872,8 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
output_attentions: bool = False,
use_cache: bool = False,
cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
):
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
@@ -965,7 +967,8 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
output_attentions: bool = False,
use_cache: bool = False,
cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -1077,7 +1080,8 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
output_attentions: bool | None = False,
use_cache: bool | None = False,
cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC
position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
**kwargs,
) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
"""
@@ -1618,7 +1622,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
width position_ids: [0, 1, 2, 3, 4]
For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
and 1D rotary position embeddin for text part.
and 1D rotary position embedding for text part.
Examples:
Temporal (Time): 3 patches, representing different segments of the video in time.
Height: 2 patches, dividing each frame vertically.
@@ -2728,7 +2732,7 @@ class Qwen2_5_VLMoEModel(Qwen2_5_VLPreTrainedModel):
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to plcae the 4D attention mask on.
The device to place the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
+2 -1
View File
@@ -565,7 +565,8 @@ def get_action_tokens(normalized_actions: torch.Tensor | list, action_tokenizer)
def pad_action_token_strs(
actions_token_lists: list[list[str]], pad_token: str = "<|endoftext|>"
actions_token_lists: list[list[str]],
pad_token: str = "<|endoftext|>", # nosec B107
) -> list[str]:
"""Pad action token lists to same length and join as strings.