""" Positional Encoding for Transformer models. Provides sinusoidal position embeddings that inject sequential order information into token representations. Required because self-attention is permutation-invariant and has no inherent notion of token position. Author: Oliver Perrin Date: December 2025 """ import math import torch import torch.nn as nn class PositionalEncoding(nn.Module): """ Implements the sinusoidal positional encoding from "Attention Is All You Need". Formula: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)) Where: pos: position in sequence (0 to max_len-1) i: dimension index (0 to d_model/2) Args: d_model: Dimension of the model embeddings max_len: Maximum sequence length to pre-compute dropout: Dropout probability to apply after adding positional encoding Shape: Input: (batch, seq_len, d_model) Output: (batch, seq_len, d_model) Example: >>> pos_enc = PositionalEncoding(d_model=512, max_len=5000) >>> x = torch.randn(32, 100, 512) # (batch, seq, d_model) >>> output = pos_enc(x) >>> output.shape torch.Size([32, 100, 512]) """ def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1): super().__init__() self.dropout = nn.Dropout(p=dropout) # Create a tensor of positions: [0, 1, 2, ..., max_len-1] # Create a tensor of dimension indices: [0, 1, 2, ..., d_model-1] # Compute the division term: 10000^(2i/d_model) # Apply sin to even indices, cos to odd indices # Register as buffer (not a parameter, but part of state_dict) position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) pe = torch.zeros(max_len, d_model) pe[:, 0::2] = torch.sin(position * div_term) # Even indices pe[:, 1::2] = torch.cos(position * div_term) # Odd indices pe = pe.unsqueeze(0) self.register_buffer("pe", pe) def forward(self, x: torch.Tensor) -> torch.Tensor: """ Add positional encoding to input embeddings. Args: x: Input embeddings (batch, seq_len, d_model) Returns: x with positional encoding added (batch, seq_len, d_model) """ # Get sequence length from input # Add the appropriate slice of positional encoding # Apply dropout # Return result pe: torch.Tensor = self.pe # type: ignore[assignment] x = x + pe[:, : x.size(1)].requires_grad_(False) # self.pe contains pre-computed encodings for all positions # just need to add the first seq_len positions to x return self.dropout(x) class LearnedPositionalEncoding(nn.Module): """ Learned positional embeddings (used by BERT, GPT, etc.). Note: T5/FLAN-T5 uses relative position bias instead of absolute positional embeddings. When loading from T5, the model uses learned positional encodings that train from scratch. Args: d_model: Dimension of the model embeddings max_len: Maximum sequence length dropout: Dropout probability padding_idx: Index of padding token (used to mask out padding positions if needed) """ def __init__( self, d_model: int, max_len: int = 1024, dropout: float = 0.1, padding_idx: int = 1 ): super().__init__() # Standard learned positional embeddings. # Note: T5's relative position bias is NOT transferred - we train these from scratch. 
        self.embeddings = nn.Embedding(max_len, d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input embeddings (batch, seq_len, d_model)
        """
        seq_len = x.size(1)
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device)
        # Broadcast to batch
        positions = positions.unsqueeze(0).expand(x.size(0), -1)
        pos_embeds = self.embeddings(positions)
        return self.dropout(x + pos_embeds)
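

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the module API): checks that both
# encodings preserve the (batch, seq_len, d_model) shape and that the
# sinusoidal table is a fixed, batch-independent offset. The batch and
# sequence sizes below are arbitrary illustrative values, not taken from
# the original module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    batch, seq_len, d_model = 4, 16, 64
    x = torch.randn(batch, seq_len, d_model)

    # Dropout is disabled so outputs are deterministic for the checks below.
    sinusoidal = PositionalEncoding(d_model=d_model, max_len=128, dropout=0.0)
    learned = LearnedPositionalEncoding(d_model=d_model, max_len=128, dropout=0.0)

    out_sin = sinusoidal(x)
    out_learned = learned(x)

    # Shapes are unchanged by either encoding.
    assert out_sin.shape == (batch, seq_len, d_model)
    assert out_learned.shape == (batch, seq_len, d_model)

    # With dropout off, the sinusoidal encoding is a pure additive offset:
    # subtracting the input recovers the same table for every batch element.
    offsets = out_sin - x
    assert torch.allclose(offsets[0], offsets[1])

    print("PositionalEncoding output:", tuple(out_sin.shape))
    print("LearnedPositionalEncoding output:", tuple(out_learned.shape))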