diff --git a/pyproject.toml b/pyproject.toml index 1390fae30..437ed4762 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -242,6 +242,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "F403"] +"src/lerobot/policies/wall_x/**" = ["N801", "N812", "SIM102", "SIM108", "SIM210", "SIM211", "B006", "B007", "SIM118"] # Suppress these as they are coming from original Qwen2_5_vl code TODO(pepijn): refactor original [tool.ruff.lint.isort] combine-as-imports = true diff --git a/src/lerobot/policies/__init__.py b/src/lerobot/policies/__init__.py index a0214a100..b5143d1af 100644 --- a/src/lerobot/policies/__init__.py +++ b/src/lerobot/policies/__init__.py @@ -21,8 +21,8 @@ from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig from .smolvla.processor_smolvla import SmolVLANewLineProcessor from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig -from .xvla.configuration_xvla import XVLAConfig as XVLAConfig from .wall_x.configuration_wall_x import WallXConfig as WallXConfig +from .xvla.configuration_xvla import XVLAConfig as XVLAConfig __all__ = [ "ACTConfig", diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 7af9086d1..fd5394f16 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -41,8 +41,8 @@ from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.utils import validate_visual_features_consistency from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig -from lerobot.policies.xvla.configuration_xvla import XVLAConfig from lerobot.policies.wall_x.configuration_wall_x import WallXConfig +from lerobot.policies.xvla.configuration_xvla import XVLAConfig from lerobot.processor import PolicyAction, PolicyProcessorPipeline from lerobot.processor.converters import ( 
batch_to_transition, @@ -361,7 +361,7 @@ def make_pre_post_processors( config=policy_cfg, dataset_stats=kwargs.get("dataset_stats"), ) - + elif isinstance(policy_cfg, WallXConfig): from lerobot.policies.wall_x.processor_wall_x import make_wall_x_pre_post_processors diff --git a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py index 2a3e5eac5..490e25095 100644 --- a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py +++ b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py @@ -681,7 +681,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim Explanation: Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For - vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately. + vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately. Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. For text embedding part, we just apply 1D rotary position embedding. 
The three rotary position index (temporal, height and width) of text embedding is always the same, so the text embedding rotary position embedding has no @@ -779,7 +779,8 @@ class Qwen2_5_VLAttention(nn.Module): output_attentions: bool = False, use_cache: bool = False, cache_position: torch.LongTensor | None = None, - position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC + position_embeddings: tuple[torch.Tensor, torch.Tensor] + | None = None, # necessary, but kept here for BC ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: bsz, q_len, _ = hidden_states.size() @@ -858,7 +859,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention): super().__init__(*args, **kwargs) # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() @@ -871,7 +872,8 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention): output_attentions: bool = False, use_cache: bool = False, cache_position: torch.LongTensor | None = None, - position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC + position_embeddings: tuple[torch.Tensor, torch.Tensor] + | None = None, # necessary, but kept here for BC ): bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) @@ -965,7 +967,8 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention): output_attentions: bool = False, use_cache: bool = False, cache_position: torch.LongTensor | None = None, - position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC + position_embeddings: tuple[torch.Tensor, torch.Tensor] + | None = None, # necessary, but kept here for BC ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. @@ -1077,7 +1080,8 @@ class Qwen2_5_VLDecoderLayer(nn.Module): output_attentions: bool | None = False, use_cache: bool | None = False, cache_position: torch.LongTensor | None = None, - position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, # necessary, but kept here for BC + position_embeddings: tuple[torch.Tensor, torch.Tensor] + | None = None, # necessary, but kept here for BC **kwargs, ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]: """ @@ -1618,7 +1622,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi width position_ids: [0, 1, 2, 3, 4] For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part - and 1D rotary position embeddin for text part. + and 1D rotary position embedding for text part. 
Examples: Temporal (Time): 3 patches, representing different segments of the video in time. Height: 2 patches, dividing each frame vertically. @@ -2728,7 +2732,7 @@ class Qwen2_5_VLMoEModel(Qwen2_5_VLPreTrainedModel): dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): - The device to plcae the 4D attention mask on. + The device to place the 4D attention mask on. cache_position (`torch.Tensor`): Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): diff --git a/src/lerobot/policies/wall_x/utils.py b/src/lerobot/policies/wall_x/utils.py index bada4ebdf..2ea40b377 100644 --- a/src/lerobot/policies/wall_x/utils.py +++ b/src/lerobot/policies/wall_x/utils.py @@ -565,7 +565,8 @@ def get_action_tokens(normalized_actions: torch.Tensor | list, action_tokenizer) def pad_action_token_strs( - actions_token_lists: list[list[str]], pad_token: str = "<|endoftext|>" + actions_token_lists: list[list[str]], + pad_token: str = "<|endoftext|>", # nosec B107 ) -> list[str]: """Pad action token lists to same length and join as strings.