OliverPerrin committed
Commit 204fb3c · 1 Parent(s): ba4cb76

Implemented ScaledDotProductAttention and MultiHeadAttention

src/models/attention.py CHANGED
@@ -5,6 +5,8 @@ This module implements the core attention mechanisms used in the Transformer mod
 - ScaledDotProductAttention: Fundamental attention operation
 - MultiHeadAttention: Parallel attention with learned projections
 
+Doing this first for a bottom-up implementation of the Transformer.
+
 Author: Oliver Perrin
 Date: 2025-10-23
 """
@@ -48,7 +50,7 @@ class ScaledDotProductAttention(nn.Module):
 
     def __init__(self):
        super().__init__()
-        # TODO: Do you need any parameters here?
+        # No learnable parameters are needed here.
        pass
 
    def forward(
@@ -69,7 +71,115 @@ class ScaledDotProductAttention(nn.Module):
        5. Compute output: output = attention_weights @ value
        6. Return both output and attention_weights
        """
-        pass
+        # Get the key dimension for scaling
+        d_k = query.size(-1)
+
+        # Compute attention scores
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
 
+        # Apply the mask if provided
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, float('-inf'))
+        # Apply softmax to get the attention weights
+        attention_weights = F.softmax(scores, dim=-1)
+
+        return torch.matmul(attention_weights, value), attention_weights
+
+
+# --------------- Multi-Head Attention ---------------
 
-# TODO: After you implement ScaledDotProductAttention, we'll add MultiHeadAttention
+class MultiHeadAttention(nn.Module):
+    """
+    Multi-Head Attention mechanism.
+
+    Allows the model to jointly attend to information from different
+    representation subspaces at different positions by projecting the
+    input into per-head query, key, and value representations.
+
+    Args:
+        d_model: Dimension of the model (default: 512)
+        num_heads: Number of attention heads (default: 8)
+        dropout: Dropout probability (default: 0.1)
+    """
+
+    def __init__(self, d_model: int = 512, num_heads: int = 8, dropout: float = 0.1):
+        super().__init__()
+
+        # d_model must be divisible by num_heads,
+        # because d_k = d_model // num_heads has to be an integer
+        assert d_model % num_heads == 0
+
+        # Assume d_v always equals d_k
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_k = d_model // num_heads
+
+        # Four linear layers (W_Q, W_K, W_V, W_O),
+        # each of them nn.Linear(d_model, d_model)
+        self.W_Q = nn.Linear(d_model, d_model)
+        self.W_K = nn.Linear(d_model, d_model)
+        self.W_V = nn.Linear(d_model, d_model)
+        self.W_O = nn.Linear(d_model, d_model)
+        # Shared ScaledDotProductAttention instance
+        self.attention = ScaledDotProductAttention()
+        # Dropout layer
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query: (batch, seq_len, d_model)
+            key: (batch, seq_len, d_model)
+            value: (batch, seq_len, d_model)
+            mask: Optional (batch, seq_len, seq_len) or (batch, 1, seq_len, seq_len)
+
+        Returns:
+            output: (batch, seq_len, d_model)
+            attention_weights: (batch, num_heads, seq_len, seq_len)
+        """
+        batch_size = query.size(0)
+
+        # Linear projections
+        Q = self.W_Q(query)  # (batch, seq_len, d_model)
+        K = self.W_K(key)
+        V = self.W_V(value)
+
+        # Split into heads:
+        # reshape Q, K, V from (batch, seq_len, d_model) to (batch, num_heads, seq_len, d_k)
+        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+        # Now: (batch, num_heads, seq_len, d_k)
+        # e.g. (batch=2, num_heads=8, seq_len=10, d_k=64)
+
+        # Handle mask broadcasting for multi-head attention
+        if mask is not None:
+            # If the mask is 3D (batch, seq, seq), add a head dimension
+            if mask.dim() == 3:
+                mask = mask.unsqueeze(1)  # (batch, 1, seq, seq)
+            # The mask now broadcasts across all heads: (batch, 1, seq, seq) → (batch, num_heads, seq, seq)
+
+        # Apply attention
+        output, attn_weights = self.attention(Q, K, V, mask)
+        # output: (batch, num_heads, seq_len, d_k)
+        # attn_weights: (batch, num_heads, seq_len, seq_len)
+
+        # Concatenate heads:
+        # (batch, num_heads, seq_len, d_k) → (batch, seq_len, num_heads, d_k) → (batch, seq_len, d_model)
+        output = output.transpose(1, 2).contiguous()
+        output = output.view(batch_size, -1, self.d_model)  # -1 in view means "infer this dimension"
+        # After the transpose the tensor's memory layout is non-contiguous;
+        # contiguous() reorganizes it in memory so view() can be applied
+
+        # Final linear projection
+        output = self.W_O(output)
+        # Apply dropout
+        output = self.dropout(output)
+
+        return output, attn_weights
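For orientation, here is a minimal usage sketch of the module added above (not part of the commit; it assumes the import path src.models.attention used by the tests). ScaledDotProductAttention computes softmax(QKᵀ/√d_k)·V, and MultiHeadAttention wraps it with the four learned projections:

import torch
from src.models.attention import MultiHeadAttention

mha = MultiHeadAttention(d_model=512, num_heads=8, dropout=0.1)
x = torch.randn(2, 10, 512)  # (batch, seq_len, d_model)

# Optional causal mask: position i may only attend to positions <= i
causal = torch.tril(torch.ones(10, 10, dtype=torch.bool))
mask = causal.unsqueeze(0).expand(2, -1, -1)  # (batch, seq_len, seq_len)

out, attn = mha(x, x, x, mask)
print(out.shape)   # torch.Size([2, 10, 512])
print(attn.shape)  # torch.Size([2, 8, 10, 10])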
src/models/decoder.py ADDED
File without changes
src/models/encoder.py ADDED
File without changes
src/models/heads.py ADDED
File without changes
src/models/multitask.py ADDED
File without changes
src/models/positional_encoding.py ADDED
File without changes
tests/test_models/test_attention.py CHANGED
@@ -6,8 +6,7 @@ Run with: pytest tests/test_models/test_attention.py -v
 
 import pytest
 import torch
-from src.models.attention import ScaledDotProductAttention
-
+from src.models.attention import ScaledDotProductAttention, MultiHeadAttention
 
 class TestScaledDotProductAttention:
     """Test suite for ScaledDotProductAttention."""
@@ -55,6 +54,94 @@ class TestScaledDotProductAttention:
        assert torch.allclose(weights[:, :, 3:], torch.zeros(batch_size, seq_len, 2), atol=1e-6)
 
    # TODO: Add more tests as you understand the mechanism better
+class TestMultiHeadAttention:
+    """Test suite for MultiHeadAttention."""
+
+    def test_output_shape(self):
+        """Test that output shapes are correct."""
+        d_model, num_heads = 512, 8
+        batch_size, seq_len = 2, 10
+
+        mha = MultiHeadAttention(d_model, num_heads)
+
+        Q = K = V = torch.randn(batch_size, seq_len, d_model)
+        output, attn_weights = mha(Q, K, V)
+
+        assert output.shape == (batch_size, seq_len, d_model)
+        assert attn_weights.shape == (batch_size, num_heads, seq_len, seq_len)
+
+    def test_different_qkv(self):
+        """Test with different Q, K, V (cross-attention scenario)."""
+        d_model, num_heads = 512, 8
+        batch_size = 2
+        seq_len_q, seq_len_kv = 10, 20
+
+        mha = MultiHeadAttention(d_model, num_heads)
+
+        Q = torch.randn(batch_size, seq_len_q, d_model)
+        K = torch.randn(batch_size, seq_len_kv, d_model)
+        V = torch.randn(batch_size, seq_len_kv, d_model)
+
+        output, attn_weights = mha(Q, K, V)
+
+        # Output has same length as query
+        assert output.shape == (batch_size, seq_len_q, d_model)
+        # Attention is query_len x key_len
+        assert attn_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_kv)
+
+    def test_masking(self):
+        """Test that masking works correctly."""
+        d_model, num_heads = 512, 8
+        batch_size, seq_len = 2, 5
+
+        mha = MultiHeadAttention(d_model, num_heads)
+        Q = K = V = torch.randn(batch_size, seq_len, d_model)
+
+        # Mask out last 2 positions
+        mask = torch.ones(batch_size, seq_len, seq_len, dtype=torch.bool)
+        mask[:, :, -2:] = False
+
+        _, attn_weights = mha(Q, K, V, mask)
+
+        # Last 2 positions should have near-zero attention
+        assert torch.allclose(
+            attn_weights[:, :, :, -2:],
+            torch.zeros(batch_size, num_heads, seq_len, 2),
+            atol=1e-6
+        )
+
+    def test_parameters_exist(self):
+        """Test that learnable parameters are created."""
+        mha = MultiHeadAttention(512, 8)
+
+        # Should have 4 linear layers worth of parameters
+        param_names = [name for name, _ in mha.named_parameters()]
+
+        assert any('W_Q' in name or 'q_linear' in name.lower() for name in param_names)
+        assert any('W_K' in name or 'k_linear' in name.lower() for name in param_names)
+        assert any('W_V' in name or 'v_linear' in name.lower() for name in param_names)
+        assert any('W_O' in name or 'out' in name.lower() for name in param_names)
+
+    def test_dropout_changes_output(self):
+        """Test that dropout is actually applied during training."""
+        torch.manual_seed(42)
+        mha = MultiHeadAttention(512, 8, dropout=0.5)
+        mha.train()  # Enable training mode
+
+        Q = K = V = torch.randn(2, 10, 512)
+
+        # Run twice with same input - should get different outputs due to dropout
+        output1, _ = mha(Q, K, V)
+        output2, _ = mha(Q, K, V)
+
+        assert not torch.allclose(output1, output2)
+
+        # In eval mode, should be deterministic
+        mha.eval()
+        output3, _ = mha(Q, K, V)
+        output4, _ = mha(Q, K, V)
+
+        assert torch.allclose(output3, output4)
 
 
 if __name__ == "__main__":
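The TODO above still invites more tests; one natural follow-up would be a causal-mask case. A sketch of such a test (an illustration, not part of this commit; it reuses the MultiHeadAttention import and the mask convention from test_masking):

def test_causal_mask_blocks_future():
    """Hypothetical extra test: a lower-triangular mask should zero attention to future positions."""
    d_model, num_heads = 512, 8
    batch_size, seq_len = 2, 6

    mha = MultiHeadAttention(d_model, num_heads)
    x = torch.randn(batch_size, seq_len, d_model)

    # Causal (lower-triangular) mask: query i may attend only to keys 0..i
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    mask = causal.unsqueeze(0).expand(batch_size, -1, -1)

    _, attn = mha(x, x, x, mask)

    # Entries strictly above the diagonal should be (near) zero
    future = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    assert torch.allclose(attn[..., future], torch.zeros_like(attn[..., future]), atol=1e-6)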
tests/test_models/test_attention_visual.py ADDED
@@ -0,0 +1,53 @@
+# Create a file: tests/test_models/test_attention_visual.py
+
+import torch
+import matplotlib.pyplot as plt
+import seaborn as sns
+from src.models.attention import ScaledDotProductAttention
+
+def test_attention_visualization():
+    """Visual test to understand attention patterns."""
+    attention = ScaledDotProductAttention()
+
+    # Create a simple case: 5 tokens, each token attends most to itself
+    batch_size = 1
+    seq_len = 5
+    d_k = 64
+
+    # Create Q, K, V
+    torch.manual_seed(42)
+    Q = torch.randn(batch_size, seq_len, d_k)
+    K = torch.randn(batch_size, seq_len, d_k)
+    V = torch.eye(seq_len, d_k).unsqueeze(0)  # Identity-like
+
+    # Compute attention
+    output, weights = attention(Q, K, V)
+
+    # Plot attention weights
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(
+        weights[0].detach().numpy(),
+        annot=True,
+        fmt='.2f',
+        cmap='viridis',
+        xticklabels=[f'Key {i}' for i in range(seq_len)],
+        yticklabels=[f'Query {i}' for i in range(seq_len)]
+    )
+    plt.title('Attention Weights Heatmap')
+    plt.xlabel('Keys (What we attend TO)')
+    plt.ylabel('Queries (What is attending)')
+    plt.tight_layout()
+    plt.savefig('outputs/attention_visualization.png')
+    print("✅ Saved visualization to outputs/attention_visualization.png")
+
+    # Print some analysis
+    print("\n" + "="*50)
+    print("Attention Analysis")
+    print("="*50)
+    for i in range(seq_len):
+        max_attn_idx = weights[0, i].argmax().item()
+        max_attn_val = weights[0, i, max_attn_idx].item()
+        print(f"Query {i} attends most to Key {max_attn_idx} (weight: {max_attn_val:.3f})")
+
+if __name__ == "__main__":
+    test_attention_visualization()
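One caveat: this test saves to outputs/ without creating the directory, so running it under pytest in a fresh checkout would presumably fail at plt.savefig. A small guard like the following before saving (a suggestion, not in the commit) avoids that:

import os
os.makedirs('outputs', exist_ok=True)  # ensure the target directory exists before plt.savefig(...)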
tests/test_models/test_multihead_visual.py ADDED
@@ -0,0 +1,162 @@
+# tests/test_models/test_multihead_visual.py
+
+import torch
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from src.models.attention import MultiHeadAttention
+
+def visualize_multihead_attention():
+    """
+    Visual test to see what different attention heads learn.
+    Creates a heatmap showing attention patterns for each head.
+    """
+    # Setup
+    torch.manual_seed(42)
+    d_model, num_heads = 512, 8
+    batch_size, seq_len = 1, 10
+
+    mha = MultiHeadAttention(d_model, num_heads, dropout=0.0)
+    mha.eval()  # No dropout for visualization
+
+    # Create input with some structure
+    # Let's make tokens attend to nearby tokens
+    X = torch.randn(batch_size, seq_len, d_model)
+
+    # Add positional bias (tokens are more similar to nearby tokens)
+    for i in range(seq_len):
+        for j in range(seq_len):
+            distance = abs(i - j)
+            X[0, i] += 0.5 * X[0, j] / (distance + 1)
+
+    # Forward pass
+    output, attn_weights = mha(X, X, X)
+
+    # attn_weights shape: (1, 8, 10, 10) = batch, heads, query_pos, key_pos
+    attn_weights = attn_weights[0].detach().numpy()  # Remove batch dim: (8, 10, 10)
+
+    # Create visualization
+    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
+    fig.suptitle('Multi-Head Attention: What Each Head Learns', fontsize=16, y=1.02)
+
+    for head_idx in range(num_heads):
+        row = head_idx // 4
+        col = head_idx % 4
+        ax = axes[row, col]
+
+        # Plot attention heatmap for this head
+        sns.heatmap(
+            attn_weights[head_idx],
+            annot=True,
+            fmt='.2f',
+            cmap='viridis',
+            cbar=True,
+            square=True,
+            ax=ax,
+            vmin=0,
+            vmax=attn_weights[head_idx].max(),
+            xticklabels=[f'K{i}' for i in range(seq_len)],
+            yticklabels=[f'Q{i}' for i in range(seq_len)]
+        )
+        ax.set_title(f'Head {head_idx}', fontweight='bold')
+        ax.set_xlabel('Keys (attend TO)')
+        ax.set_ylabel('Queries (attending FROM)')
+
+    plt.tight_layout()
+    plt.savefig('outputs/multihead_attention_visualization.png', dpi=150, bbox_inches='tight')
+    print("✅ Saved visualization to outputs/multihead_attention_visualization.png")
+
+    # Print statistics
+    print("\n" + "="*60)
+    print("Multi-Head Attention Analysis")
+    print("="*60)
+
+    for head_idx in range(num_heads):
+        head_attn = attn_weights[head_idx]
+
+        # Find dominant pattern
+        diagonal_strength = np.trace(head_attn) / seq_len
+        off_diagonal = (head_attn.sum() - np.trace(head_attn)) / (seq_len * (seq_len - 1))
+
+        print(f"\nHead {head_idx}:")
+        print(f"  Self-attention strength: {diagonal_strength:.3f}")
+        print(f"  Cross-attention strength: {off_diagonal:.3f}")
+
+        # Find which position each query attends to most
+        max_attentions = head_attn.argmax(axis=1)
+        print(f"  Attention pattern: {max_attentions.tolist()}")
+
+
+def compare_single_vs_multihead():
+    """
+    Compare single-head vs multi-head attention capacity.
+    """
+    torch.manual_seed(42)
+    seq_len, d_model = 8, 512
+
+    # Create data with two different patterns
+    # Pattern 1: Sequential (token i attends to i+1)
+    # Pattern 2: Pairwise (tokens 0-1, 2-3, 4-5, 6-7 attend to each other)
+
+    X = torch.randn(1, seq_len, d_model)
+
+    # Test with 1 head vs 8 heads
+    mha_1head = MultiHeadAttention(d_model, num_heads=1, dropout=0.0)
+    mha_8heads = MultiHeadAttention(d_model, num_heads=8, dropout=0.0)
+
+    mha_1head.eval()
+    mha_8heads.eval()
+
+    _, attn_1head = mha_1head(X, X, X)
+    _, attn_8heads = mha_8heads(X, X, X)
+
+    # Plot comparison
+    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
+
+    # Single head
+    sns.heatmap(
+        attn_1head[0, 0].detach().numpy(),
+        annot=True,
+        fmt='.2f',
+        cmap='viridis',
+        cbar=True,
+        square=True,
+        ax=axes[0]
+    )
+    axes[0].set_title('Single-Head Attention\n(Limited expressiveness)', fontweight='bold')
+    axes[0].set_xlabel('Keys')
+    axes[0].set_ylabel('Queries')
+
+    # Multi-head average
+    avg_attn = attn_8heads[0].mean(dim=0).detach().numpy()
+    sns.heatmap(
+        avg_attn,
+        annot=True,
+        fmt='.2f',
+        cmap='viridis',
+        cbar=True,
+        square=True,
+        ax=axes[1]
+    )
+    axes[1].set_title('8-Head Attention (Average)\n(Richer patterns)', fontweight='bold')
+    axes[1].set_xlabel('Keys')
+    axes[1].set_ylabel('Queries')
+
+    plt.tight_layout()
+    plt.savefig('outputs/single_vs_multihead.png', dpi=150, bbox_inches='tight')
+    print("✅ Saved comparison to outputs/single_vs_multihead.png")
+
+
+if __name__ == "__main__":
+    import os
+    os.makedirs('outputs', exist_ok=True)
+
+    print("Visualizing multi-head attention patterns...")
+    visualize_multihead_attention()
+
+    print("\nComparing single-head vs multi-head...")
+    compare_single_vs_multihead()
+
+    print("\n" + "="*60)
+    print("✅ All visualizations complete!")
+    print("="*60)