eousphoros
/

DeepSeek-V3.2-NVFP4

@@ -16,6 +16,7 @@ import triton
 import triton.language as tl
 from triton.tools.tensor_descriptor import TensorDescriptor
 from typing import Tuple, Optional
 # NVFP4 E2M1 lookup table for dequantization
@@ -24,6 +25,19 @@ NVFP4_LUT = torch.tensor([
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,  # negative values
 ], dtype=torch.float32)
 # Block size for NVFP4 (16 elements per scale)
 NVFP4_BLOCK_SIZE = 16
@@ -109,8 +123,8 @@ def dequantize_nvfp4(
     high = (packed >> 4) & 0x0F
     fp4_tensor = torch.stack([low, high], dim=-1).reshape(M, K)
-    # Lookup table dequantization
-    lut = NVFP4_LUT.to(device=packed.device)
     tensor = lut[fp4_tensor.long()]
     # Apply dual-level scales
@@ -412,7 +426,7 @@ def test_nvfp4_gemm():
     # Compare
     error = (ref - out_deq).abs().mean()
-    print(f"✅ NVFP4 GEMM dequant test: mean abs error = {error:.6f}")
     return True

 import triton.language as tl
 from triton.tools.tensor_descriptor import TensorDescriptor
 from typing import Tuple, Optional
+import functools
 # NVFP4 E2M1 lookup table for dequantization
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,  # negative values
 ], dtype=torch.float32)
+@functools.lru_cache(maxsize=8)
+def _get_nvfp4_lut(device_str: str) -> torch.Tensor:
+    """Get NVFP4 lookup table on specified device (cached).
+    Args:
+        device_str: Device string (e.g., 'cpu', 'cuda:0')
+    Returns:
+        NVFP4 lookup table on the specified device
+    """
+    return NVFP4_LUT.to(device=device_str)
 # Block size for NVFP4 (16 elements per scale)
 NVFP4_BLOCK_SIZE = 16
     high = (packed >> 4) & 0x0F
     fp4_tensor = torch.stack([low, high], dim=-1).reshape(M, K)
+    # Lookup table dequantization (use cached LUT for efficiency)
+    lut = _get_nvfp4_lut(str(packed.device))
     tensor = lut[fp4_tensor.long()]
     # Apply dual-level scales
     # Compare
     error = (ref - out_deq).abs().mean()
+    print(f"PASS: NVFP4 GEMM dequant test: mean abs error = {error:.6f}")
     return True