fix precommit issues

This commit is contained in:
Pepijn
2025-12-17 15:16:03 +01:00
committed by Michel Aractingi
parent 9ae4477356
commit c514d9ffe2
5 changed files with 18 additions and 12 deletions
+1
View File
@@ -242,6 +242,7 @@ ignore = [
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401", "F403"] "__init__.py" = ["F401", "F403"]
"src/lerobot/policies/wall_x/**" = ["N801", "N812", "SIM102", "SIM108", "SIM210", "SIM211", "B006", "B007", "SIM118"] # Supprese these as they are coming from original Qwen2_5_vl code TODO(pepijn): refactor original
[tool.ruff.lint.isort] [tool.ruff.lint.isort]
combine-as-imports = true combine-as-imports = true
+1 -1
View File
@@ -21,8 +21,8 @@ from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
from .smolvla.processor_smolvla import SmolVLANewLineProcessor from .smolvla.processor_smolvla import SmolVLANewLineProcessor
from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
from .wall_x.configuration_wall_x import WallXConfig as WallXConfig from .wall_x.configuration_wall_x import WallXConfig as WallXConfig
from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
__all__ = [ __all__ = [
"ACTConfig", "ACTConfig",
+2 -2
View File
@@ -41,8 +41,8 @@ from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
from lerobot.policies.utils import validate_visual_features_consistency from lerobot.policies.utils import validate_visual_features_consistency
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
from lerobot.policies.wall_x.configuration_wall_x import WallXConfig from lerobot.policies.wall_x.configuration_wall_x import WallXConfig
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
from lerobot.processor import PolicyAction, PolicyProcessorPipeline from lerobot.processor import PolicyAction, PolicyProcessorPipeline
from lerobot.processor.converters import ( from lerobot.processor.converters import (
batch_to_transition, batch_to_transition,
@@ -361,7 +361,7 @@ def make_pre_post_processors(
config=policy_cfg, config=policy_cfg,
dataset_stats=kwargs.get("dataset_stats"), dataset_stats=kwargs.get("dataset_stats"),
) )
elif isinstance(policy_cfg, WallXConfig): elif isinstance(policy_cfg, WallXConfig):
from lerobot.policies.wall_x.processor_wall_x import make_wall_x_pre_post_processors from lerobot.policies.wall_x.processor_wall_x import make_wall_x_pre_post_processors
@@ -681,7 +681,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
Explanation: Explanation:
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately. vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal, For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
@@ -779,7 +779,8 @@ class Qwen2_5_VLAttention(nn.Module):
output_attentions: bool = False, output_attentions: bool = False,
use_cache: bool = False, use_cache: bool = False,
cache_position: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
bsz, q_len, _ = hidden_states.size() bsz, q_len, _ = hidden_states.size()
@@ -858,7 +859,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
@@ -871,7 +872,8 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
output_attentions: bool = False, output_attentions: bool = False,
use_cache: bool = False, use_cache: bool = False,
cache_position: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
): ):
bsz, q_len, _ = hidden_states.size() bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states) query_states = self.q_proj(hidden_states)
@@ -965,7 +967,8 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
output_attentions: bool = False, output_attentions: bool = False,
use_cache: bool = False, use_cache: bool = False,
cache_position: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
if output_attentions: if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -1077,7 +1080,8 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
output_attentions: bool | None = False, output_attentions: bool | None = False,
use_cache: bool | None = False, use_cache: bool | None = False,
cache_position: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC position_embeddings: tuple[torch.Tensor, torch.Tensor]
| None = None, # necessary, but kept here for BC
**kwargs, **kwargs,
) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]: ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
""" """
@@ -1618,7 +1622,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
width position_ids: [0, 1, 2, 3, 4] width position_ids: [0, 1, 2, 3, 4]
For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
and 1D rotary position embeddin for text part. and 1D rotary position embedding for text part.
Examples: Examples:
Temporal (Time): 3 patches, representing different segments of the video in time. Temporal (Time): 3 patches, representing different segments of the video in time.
Height: 2 patches, dividing each frame vertically. Height: 2 patches, dividing each frame vertically.
@@ -2728,7 +2732,7 @@ class Qwen2_5_VLMoEModel(Qwen2_5_VLPreTrainedModel):
dtype (`torch.dtype`): dtype (`torch.dtype`):
The dtype to use for the 4D attention mask. The dtype to use for the 4D attention mask.
device (`torch.device`): device (`torch.device`):
The device to plcae the 4D attention mask on. The device to place the 4D attention mask on.
cache_position (`torch.Tensor`): cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence. Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`): batch_size (`torch.Tensor`):
+2 -1
View File
@@ -565,7 +565,8 @@ def get_action_tokens(normalized_actions: torch.Tensor | list, action_tokenizer)
def pad_action_token_strs( def pad_action_token_strs(
actions_token_lists: list[list[str]], pad_token: str = "<|endoftext|>" actions_token_lists: list[list[str]],
pad_token: str = "<|endoftext|>", # nosec B107
) -> list[str]: ) -> list[str]:
"""Pad action token lists to same length and join as strings. """Pad action token lists to same length and join as strings.