Upload inference/generate.py with huggingface_hub

inference/generate.py  CHANGED  (+59, -1)
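
The commit title indicates the file was pushed with the huggingface_hub client rather than through the web UI. For reference, a minimal sketch of that kind of upload (the repo id below is a placeholder, not taken from this commit):

    # Hypothetical upload sketch; repo_id is a placeholder, not the real repository.
    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from HF_TOKEN or a prior `huggingface-cli login`
    api.upload_file(
        path_or_fileobj="inference/generate.py",
        path_in_repo="inference/generate.py",
        repo_id="<user>/<repo>",
        commit_message="Upload inference/generate.py with huggingface_hub",
    )
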
@@ -81,6 +81,14 @@ def load_sharded_model(model, ckpt_path):
         # Get unique shard files
         shard_files = sorted(set(weight_map.values()))
 
+        # Check memory before loading
+        try:
+            import psutil
+            mem = psutil.virtual_memory()
+            print(f"Memory: {mem.available / 1e9:.1f}GB available / {mem.total / 1e9:.1f}GB total ({mem.percent:.1f}% used)")
+        except ImportError:
+            pass  # psutil not required
+
         print(f"Loading {len(shard_files)} shards (streaming to GPU)...")
         model_state = model.state_dict()
         loaded_keys = set()
@@ -105,6 +113,9 @@ def load_sharded_model(model, ckpt_path):
             print(f"Warning: {len(missing)} missing keys in checkpoint")
            for k in list(missing)[:5]:
                print(f"  - {k}")
+
+        # Reattach FP8 scales after loading
+        link_fp8_scales(model)
     else:
         # Fall back to single file
         single_file = os.path.join(ckpt_path, "model0-mp1.safetensors")
@@ -181,6 +192,48 @@ def generate(
     return completion_tokens
 
 
+def clear_system_cache():
+    """
+    Clear system cache to free memory (optional optimization).
+
+    This can help with large models by freeing cached memory.
+    Silently attempts cache clearing; failures are ignored.
+    """
+    try:
+        import subprocess
+        subprocess.run(
+            ['sudo', 'sh', '-c', 'echo 3 > /proc/sys/vm/drop_caches'],
+            check=False, capture_output=True, text=True, timeout=5
+        )
+    except Exception:
+        # Silently ignore if cache clearing fails
+        pass
+
+
+def link_fp8_scales(model):
+    """
+    Link FP8 scales to weight tensors after loading.
+
+    After load_state_dict(), FP8 weights lose their .scale attribute.
+    This function reattaches them.
+    """
+    from model import Linear, ColumnParallelLinear, RowParallelLinear
+
+    linked = 0
+    for name, module in model.named_modules():
+        if isinstance(module, (Linear, ColumnParallelLinear, RowParallelLinear)):
+            # Check if this is an FP8 layer
+            if hasattr(module, 'weight') and hasattr(module, 'scale'):
+                if module.weight is not None and module.scale is not None:
+                    if module.weight.dtype == torch.float8_e4m3fn:
+                        # Reattach scale as attribute
+                        module.weight.scale = module.scale
+                        linked += 1
+
+    if linked > 0:
+        print(f"✓ Linked scales for {linked} FP8 layers")
+
+
 def main(
     ckpt_path: str,
     config: str,
@@ -214,6 +267,11 @@ def main(
     with open(config) as f:
         args = ModelArgs(**json.load(f))
     print(args)
+
+    # Optionally clear cache to free memory before loading large model
+    if rank == 0:
+        clear_system_cache()
+
     print("Creating model on CPU (this may take a while)...")
     with torch.device("cpu"):
         model = Transformer(args)
@@ -221,7 +279,7 @@ def main(
     print("Loading model weights...")
     load_sharded_model(model, ckpt_path)
     model.eval()
-    print("
+    print("DeepSeek V3.2 NVFP4 - Ready")
 
     if interactive:
         messages = []
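
Note on the FP8 scale relinking: in the DeepSeek-V3 style inference code this script follows, FP8 weights carry their block scales as a Python attribute (weight.scale) that the linear() helper reads at forward time; the loading path here drops that attribute, so link_fp8_scales() restores it. A rough sketch of that dependency, assuming the upstream weight_dequant helper from kernel.py (these names are assumptions, not part of this commit):

    # Sketch only: illustrates why weight.scale must exist after loading.
    import torch
    import torch.nn.functional as F
    from kernel import weight_dequant  # assumed upstream helper

    def linear(x, weight, bias=None):
        if weight.element_size() > 1:
            # bf16/fp32 weights are used directly
            return F.linear(x, weight, bias)
        # FP8 weights: dequantize with the block scales stored on weight.scale,
        # which is exactly the attribute link_fp8_scales() reattaches
        return F.linear(x, weight_dequant(weight, weight.scale), bias)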