add RoPE attention module as this is shown to help training dynamics and generation quality for DiTs

2026-07-23 17:56:07 +00:00 · 2025-12-09 08:42:56 -08:00
parent a0d5a088e3
commit 46ebcc2f7d
2 changed files with 183 additions and 7 deletions
@@ -167,9 +167,13 @@ class TransformerConfig:
    num_layers: int = 6  # Number of transformer layers
    num_heads: int = 8  # Number of attention heads
    dropout: float = 0.1  # Dropout rate
-    use_positional_encoding: bool = True  # Whether to use positional encoding
+    use_positional_encoding: bool = False  # Whether to use absolute positional encoding
    diffusion_step_embed_dim: int = 256  # Timestep embedding size
    # RoPE (Rotary Position Embedding) configuration
    use_rope: bool = True  # Whether to use Rotary Position Embedding in attention (baseline is True)
    rope_base: float = 10000.0  # Base frequency for RoPE computation
    def __post_init__(self):
        """Validate Transformer-specific parameters."""
        if self.hidden_dim <= 0:
@@ -71,6 +71,146 @@ class SinusoidalPosEmb(nn.Module):
        return emb
 class RotaryPositionalEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE) for transformers.

    RoPE encodes position information by rotating query and key vectors,
    so that their dot product depends on the relative offset between
    positions. It is applied at every attention layer rather than once at
    the input. Doing so requires an attention implementation that rotates
    Q and K before computing the attention scores, so the built-in
    MultiheadAttention module cannot be used.

    Original RoPE Paper: https://arxiv.org/abs/2104.09864 (RoFormer)
    """

    def __init__(self, head_dim: int, max_seq_len: int = 512, base: float = 10000.0):
        """
        Args:
            head_dim: Size of the last dimension of q/k; must be even
            max_seq_len: Number of positions for which cos/sin are precomputed
            base: Base of the geometric frequency progression
        """
        super().__init__()
        assert head_dim % 2 == 0, "head_dim must be even for RoPE"
        self.head_dim = head_dim
        self.max_seq_len = max_seq_len
        self.base = base
        # Precompute inverse frequencies: theta_i = 1 / (base^(2i/d))
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
        # Non-persistent: derived from the constructor arguments, so it is
        # kept out of the state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._precompute_cache(max_seq_len)

    def _precompute_cache(self, seq_len: int):
        """Build cos/sin tables of shape (1, 1, seq_len, head_dim)."""
        t = torch.arange(seq_len, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)  # (seq_len, head_dim / 2)
        # Duplicate so that feature i and feature i + head_dim/2 share an
        # angle, matching the pairing used by _rotate_half.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("_cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("_sin_cached", emb.sin()[None, None, :, :], persistent=False)

    def _rotate_half(self, x: Tensor) -> Tensor:
        """Rotate half the hidden dims of the input.

        For x = [x1, x2], returns [-x2, x1]
        """
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    def _cos_sin(self, seq_len: int, dtype: torch.dtype) -> tuple[Tensor, Tensor]:
        """Return the cached cos/sin tables for positions 0..seq_len-1.

        Raises:
            ValueError: If seq_len exceeds the precomputed max_seq_len.
        """
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}. "
                f"Increase max_seq_len in RoPE config."
            )
        cos = self._cos_cached[:, :, :seq_len, :].to(dtype)
        sin = self._sin_cached[:, :, :seq_len, :].to(dtype)
        return cos, sin

    def forward(self, q: Tensor, k: Tensor) -> tuple[Tensor, Tensor]:
        """Apply rotary embeddings to query and key tensors.

        The sequence axis is dim 2 and the feature axis is the last dim
        (e.g. (B, num_heads, T, head_dim)). Index i along the sequence axis
        is treated as position i for both tensors.

        Args:
            q: Query tensor
            k: Key tensor; its sequence length may differ from q's

        Returns:
            (q_rotated, k_rotated), each with the shape of its input

        Raises:
            ValueError: If either sequence length exceeds max_seq_len.
        """
        q_len, k_len = q.shape[2], k.shape[2]
        q_cos, q_sin = self._cos_sin(q_len, q.dtype)
        if k_len == q_len:
            k_cos, k_sin = q_cos, q_sin
        else:
            # Slice a table for k's own length. Reusing q's table would
            # silently broadcast a length-1 key to q's length, or fail with
            # an opaque shape error for other mismatches.
            k_cos, k_sin = self._cos_sin(k_len, q.dtype)
        # Apply rotation: x_rot = x * cos + rotate_half(x) * sin
        q_rotated = (q * q_cos) + (self._rotate_half(q) * q_sin)
        k_rotated = (k * k_cos) + (self._rotate_half(k) * k_sin)
        return q_rotated, k_rotated
 class RoPEAttention(nn.Module):
    """Multi-head self-attention whose queries and keys are rotated with RoPE.

    A fused linear layer produces Q, K and V; Q and K are passed through
    the rotary embedding before scaled dot-product attention, so position
    information is injected inside every attention layer that uses this
    module.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        dropout: float = 0.0,
        max_seq_len: int = 512,
        rope_base: float = 10000.0,
    ):
        """
        Args:
            hidden_size: Total hidden dimension
            num_heads: Number of attention heads
            dropout: Attention dropout rate
            max_seq_len: Maximum sequence length for RoPE cache
            rope_base: Base for RoPE frequency computation
        """
        super().__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        # Stored for reference; forward() does not pass it to the attention
        # call, which applies its own default scaling.
        self.scale = self.head_dim**-0.5
        # One projection yields Q, K and V concatenated along the feature axis.
        self.qkv_proj = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
        self.out_proj = nn.Linear(hidden_size, hidden_size, bias=True)
        # Only the dropout probability is read in forward(); Identity marks
        # "no attention dropout".
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()
        self.rope = RotaryPositionalEmbedding(head_dim=self.head_dim, max_seq_len=max_seq_len, base=rope_base)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: (B, T, hidden_size) input sequence
        Returns:
            (B, T, hidden_size) attention output
        """
        batch, seq_len, _ = x.shape
        # Project, then split into heads: (3, B, num_heads, T, head_dim)
        fused = self.qkv_proj(x).reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
        queries, keys, values = fused.permute(2, 0, 3, 1, 4).unbind(0)
        # Rotate Q and K; V carries no positional rotation
        queries, keys = self.rope(queries, keys)
        # Attention dropout is active only in training mode and only when a
        # Dropout module was configured.
        drop_p = 0.0
        if self.training and isinstance(self.dropout, nn.Dropout):
            drop_p = self.dropout.p
        heads_out = torch.nn.functional.scaled_dot_product_attention(
            queries,
            keys,
            values,
            dropout_p=drop_p,
        )  # (B, num_heads, T, head_dim)
        # Merge heads back to (B, T, hidden_size) and apply the output projection
        merged = heads_out.transpose(1, 2).reshape(batch, seq_len, self.hidden_size)
        return self.out_proj(merged)
 class TransformerBlock(nn.Module):
    """DiT-style transformer block with AdaLN-Zero.
@@ -78,11 +218,20 @@ class TransformerBlock(nn.Module):
    - shift_msa, scale_msa, gate_msa: for attention block
    - shift_mlp, scale_mlp, gate_mlp: for MLP block
    Supports both standard attention and RoPE attention.
    Reference: https://github.com/facebookresearch/DiT
    """
    def __init__(
-        self, hidden_size: int = 128, num_heads: int = 4, num_features: int = 128, dropout: float = 0.0
+        self,
        hidden_size: int = 128,
        num_heads: int = 4,
        num_features: int = 128,
        dropout: float = 0.0,
        use_rope: bool = False,
        max_seq_len: int = 512,
        rope_base: float = 10000.0,
    ):
        """
        Args:
@@ -90,12 +239,26 @@ class TransformerBlock(nn.Module):
            num_heads: Number of attention heads
            num_features: Size of conditioning features
            dropout: Dropout rate
            use_rope: Whether to use Rotary Position Embedding
            max_seq_len: Maximum sequence length (for RoPE cache)
            rope_base: Base frequency for RoPE
        """
        super().__init__()
-        self.multihead_attn = nn.MultiheadAttention(
+        self.use_rope = use_rope
-            hidden_size, num_heads=num_heads, batch_first=True, dropout=dropout
+
-        )
+        if use_rope:
            self.attn = RoPEAttention(
                hidden_size=hidden_size,
                num_heads=num_heads,
                dropout=dropout,
                max_seq_len=max_seq_len,
                rope_base=rope_base,
            )
        else:
            self.multihead_attn = nn.MultiheadAttention(
                hidden_size, num_heads=num_heads, batch_first=True, dropout=dropout
            )
        # Layer normalizations (no learnable affine parameters, all adaptation via conditioning)
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
@@ -128,7 +291,12 @@ class TransformerBlock(nn.Module):
        # Attention block: norm → modulate → attn → gate × output → residual
        # modulate requires unsqueeze(1) to add sequence dimension for broadcasting
        attn_input = modulate(self.norm1(x), shift_msa.unsqueeze(1), scale_msa.unsqueeze(1))
-        attn_out, _ = self.multihead_attn(attn_input, attn_input, attn_input)
+
        if self.use_rope:
            attn_out = self.attn(attn_input)
        else:
            attn_out, _ = self.multihead_attn(attn_input, attn_input, attn_input)
        x = x + gate_msa.unsqueeze(1) * attn_out
        # MLP block: norm → modulate → mlp → gate × output → residual
@@ -163,6 +331,7 @@ class DiffusionTransformer(nn.Module):
        self.num_layers = self.transformer_config.num_layers
        self.num_heads = self.transformer_config.num_heads
        self.dropout = self.transformer_config.dropout
        self.use_rope = self.transformer_config.use_rope
        self.timestep_embed_dim = self.transformer_config.diffusion_step_embed_dim
        self.time_mlp = nn.Sequential(
@@ -179,7 +348,7 @@ class DiffusionTransformer(nn.Module):
        self.input_proj = nn.Linear(self.action_dim, self.hidden_size)
        if self.transformer_config.use_positional_encoding:
-            # Learnable positional embeddings for sequence positions
+            # Learnable positional embeddings for sequence positions (absolute encoding)
            self.pos_embedding = nn.Parameter(
                torch.empty(1, self.horizon, self.hidden_size).normal_(std=0.02)
            )
@@ -193,6 +362,9 @@ class DiffusionTransformer(nn.Module):
                    num_heads=self.num_heads,
                    num_features=self.cond_dim,
                    dropout=self.dropout,
                    use_rope=self.use_rope,
                    max_seq_len=self.horizon,  # This remains fixed because we aren't generating variable length sequences
                    rope_base=getattr(self.transformer_config, "rope_base", 10000.0),
                )
                for _ in range(self.num_layers)
            ]