#!/usr/bin/env python3
"""
Test script for float16 compatibility with pre-quantized model
"""

import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_float16_compatibility():
    """Test float16 compatibility with pre-quantized model"""
    model_id = "Tonic/petite-elle-L-aime-3-sft"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    logger.info(f"Testing float16 compatibility on device: {device}")

    # Test both float32 and float16 on GPU; only float32 on CPU
    if device == "cuda":
        dtypes_to_test = [torch.float32, torch.float16]
    else:
        dtypes_to_test = [torch.float32]

    for dtype in dtypes_to_test:
        logger.info(f"\nTesting with dtype: {dtype}")

        try:
            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            if tokenizer.pad_token_id is None:
                tokenizer.pad_token_id = tokenizer.eos_token_id

            # Load model with specific dtype
            model_kwargs = {
                "device_map": "auto" if device == "cuda" else "cpu",
                "torch_dtype": dtype,
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
            }

            logger.info(f"Loading model with {dtype}...")
            model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

            # Test generation
            test_prompt = "Bonjour, comment allez-vous?"
            inputs = tokenizer(test_prompt, return_tensors="pt")

            if device == "cuda":
                inputs = {k: v.cuda() for k, v in inputs.items()}

            logger.info("Generating response...")
            with torch.no_grad():
                output_ids = model.generate(
                    inputs['input_ids'],
                    max_new_tokens=50,
                    temperature=0.7,
                    top_p=0.95,
                    do_sample=True,
                    attention_mask=inputs['attention_mask'],
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    cache_implementation="static",
                )

            response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            assistant_response = response[len(test_prompt):].strip()

            logger.info(f"✅ {dtype} test successful!")
            logger.info(f"Input: {test_prompt}")
            logger.info(f"Output: {assistant_response}")

            # Check memory usage
            if device == "cuda":
                memory_used = torch.cuda.memory_allocated() / 1024**3
                logger.info(f"GPU Memory used: {memory_used:.2f} GB")

            # Check model dtype
            logger.info(f"Model dtype: {model.dtype}")

            # Clean up
            del model
            if device == "cuda":
                torch.cuda.empty_cache()

        except Exception as e:
            logger.error(f"❌ {dtype} test failed: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    test_float16_compatibility()