import gradio as gr
from llama_cpp import Llama
import time
import os

# Configuration
MODEL_REPO = "kainatq/quantum-keek-7b-Q4_K_M-GGUF"
MODEL_FILE = "quantum-keek-7b-q4_k_m.gguf"
MODEL_PATH = f"./{MODEL_FILE}"


# Initialize the model
def load_model():
    try:
        # Download the model if it does not exist locally
        if not os.path.exists(MODEL_PATH):
            print("Downloading model... This may take a while.")
            from huggingface_hub import hf_hub_download
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=".",
                local_dir_use_symlinks=False
            )

        # Initialize Llama with CPU optimization
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,       # Context window
            n_threads=2,      # Use both vCPUs
            n_batch=512,
            use_mlock=False,  # Don't lock memory (limited RAM)
            use_mmap=True,    # Use memory mapping
            verbose=False
        )
        print("Model loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


# Load the model
llm = load_model()


def chat_with_ai(message, history, system_prompt, temperature, max_tokens):
    """
    Handle one chat turn with the AI model and return the updated chat
    history (list of (user, assistant) pairs) for the Chatbot component.
    """
    if llm is None:
        return history + [(message, "Error: Model not loaded. Please check the console for details.")]

    # Prepare conversation history
    conversation = []

    # Add system prompt
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    # Add history
    for human, assistant in history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])

    # Add current message
    conversation.append({"role": "user", "content": message})

    try:
        # Create a plain-text prompt from the conversation
        prompt = ""
        for msg in conversation:
            if msg["role"] == "system":
                prompt += f"System: {msg['content']}\n\n"
            elif msg["role"] == "user":
                prompt += f"User: {msg['content']}\n\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n\n"
        prompt += "Assistant:"

        # Generate response
        start_time = time.time()
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop=["User:", "System:"],
            echo=False,
            stream=False
        )
        generation_time = time.time() - start_time

        answer = response['choices'][0]['text'].strip()

        # Add generation info
        tokens_used = response['usage']['total_tokens']
        answer += f"\n\n---\n*Generated in {generation_time:.2f}s using {tokens_used} tokens*"

        # The Chatbot output expects the full history, not just the new reply
        return history + [(message, answer)]

    except Exception as e:
        return history + [(message, f"Error generating response: {str(e)}")]


def clear_chat():
    """Clear the chat history and the message box."""
    return [], ""


# Custom CSS for ChatGPT-like styling
custom_css = """
#chatbot {
    min-height: 400px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    background: #f9f9f9;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark #chatbot {
    background: #1e1e1e;
    border-color: #444;
}
"""

# Create the Gradio interface
with gr.Blocks(
    title="đŸĒ Quantum Keek Chat",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown(
        """
        # đŸĒ Quantum Keek Chat
        *Powered by Quantum Keek 7B GGUF - Running on CPU with llama.cpp*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are Quantum Keek, a helpful AI assistant. Provide detailed, thoughtful responses to user queries.",
                lines=3,
                placeholder="Enter system instructions..."
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values = more creative, lower values = more focused"
            )

            max_tokens = gr.Slider(
                minimum=100,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of response"
            )

            clear_btn = gr.Button("đŸ—‘ī¸ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ---
                **Model Info:**
                - **Model:** Quantum Keek 7B Q4_K_M
                - **Platform:** CPU (llama.cpp)
                - **Context:** 4096 tokens
                """
            )

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="đŸĒ Quantum Keek",
                elem_id="chatbot",
                height=500,
                show_copy_button=True
            )

            msg = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                max_lines=5
            )

            with gr.Row():
                submit_btn = gr.Button("🚀 Send", variant="primary")
                stop_btn = gr.Button("âšī¸ Stop", variant="secondary")

    # Event handlers
    submit_event = msg.submit(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    submit_btn.click(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, msg]
    )

    # Stop button functionality
    def stop_generation():
        # This is a placeholder - in a real implementation you'd need to handle streaming
        return "Generation stopped by user."

    stop_btn.click(
        fn=stop_generation,
        outputs=[msg]
    )

    gr.Markdown(
        """
        ---
        **Note:** This app runs on the Hugging Face Spaces free tier (2 vCPUs, 16 GB RAM).
        Responses may take a few seconds to generate.
        """
    )

if __name__ == "__main__":
    # Set a Hugging Face token if needed (for gated models)
    # os.environ["HUGGINGFACE_HUB_TOKEN"] = "your_token_here"

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )