import gradio as gr
from llama_cpp import Llama
import time
import os

# Configuration
MODEL_REPO = "kainatq/quantum-keek-7b-Q4_K_M-GGUF"
MODEL_FILE = "quantum-keek-7b-q4_k_m.gguf"
MODEL_PATH = f"./{MODEL_FILE}"


# Initialize the model
def load_model():
    try:
        # Download the model if it does not exist locally
        if not os.path.exists(MODEL_PATH):
            print("Downloading model... This may take a while.")
            from huggingface_hub import hf_hub_download
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=".",
                local_dir_use_symlinks=False
            )

        # Initialize Llama with CPU optimization
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,       # Context window
            n_threads=2,      # Use both vCPUs
            n_batch=512,
            use_mlock=False,  # Don't lock memory (limited RAM)
            use_mmap=True,    # Use memory mapping
            verbose=False
        )
        print("Model loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


# Load the model
llm = load_model()


def chat_with_ai(message, history, system_prompt, temperature, max_tokens):
    """
    Handle one chat turn with the AI model and return the updated chat
    history (list of (user, assistant) pairs) for the Chatbot component.
    """
    if llm is None:
        return history + [(message, "Error: Model not loaded. Please check the console for details.")]

    # Prepare conversation history
    conversation = []

    # Add system prompt
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    # Add history
    for human, assistant in history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])

    # Add current message
    conversation.append({"role": "user", "content": message})

    try:
        # Create a plain-text prompt from the conversation
        prompt = ""
        for msg in conversation:
            if msg["role"] == "system":
                prompt += f"System: {msg['content']}\n\n"
            elif msg["role"] == "user":
                prompt += f"User: {msg['content']}\n\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n\n"
        prompt += "Assistant:"

        # Generate response
        start_time = time.time()
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop=["User:", "System:"],
            echo=False,
            stream=False
        )
        generation_time = time.time() - start_time

        answer = response['choices'][0]['text'].strip()

        # Add generation info
        tokens_used = response['usage']['total_tokens']
        answer += f"\n\n---\n*Generated in {generation_time:.2f}s using {tokens_used} tokens*"

        # The Chatbot output expects the full history, not just the new reply
        return history + [(message, answer)]

    except Exception as e:
        return history + [(message, f"Error generating response: {str(e)}")]


def clear_chat():
    """Clear the chat history and the message box."""
    return [], ""


# Custom CSS for ChatGPT-like styling
custom_css = """
#chatbot {
    min-height: 400px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    background: #f9f9f9;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark #chatbot {
    background: #1e1e1e;
    border-color: #444;
}
"""

# Create the Gradio interface
with gr.Blocks(
    title="đŸĒ Quantum Keek Chat",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown(
        """
        # đŸĒ Quantum Keek Chat
        *Powered by Quantum Keek 7B GGUF - Running on CPU with llama.cpp*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are Quantum Keek, a helpful AI assistant. Provide detailed, thoughtful responses to user queries.",
                lines=3,
                placeholder="Enter system instructions..."
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values = more creative, lower values = more focused"
            )

            max_tokens = gr.Slider(
                minimum=100,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of response"
            )

            clear_btn = gr.Button("đŸ—‘ī¸ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ---
                **Model Info:**
                - **Model:** Quantum Keek 7B Q4_K_M
                - **Platform:** CPU (llama.cpp)
                - **Context:** 4096 tokens
                """
            )

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="đŸĒ Quantum Keek",
                elem_id="chatbot",
                height=500,
                show_copy_button=True
            )

            msg = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                max_lines=5
            )

            with gr.Row():
                submit_btn = gr.Button("🚀 Send", variant="primary")
                stop_btn = gr.Button("âšī¸ Stop", variant="secondary")

    # Event handlers
    submit_event = msg.submit(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    submit_btn.click(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, msg]
    )

    # Stop button functionality
    def stop_generation():
        # This is a placeholder - in a real implementation you'd need to handle streaming
        return "Generation stopped by user."

    stop_btn.click(
        fn=stop_generation,
        outputs=[msg]
    )

    gr.Markdown(
        """
        ---
        **Note:** This app runs on the Hugging Face Spaces free tier (2 vCPUs, 16 GB RAM).
        Responses may take a few seconds to generate.
        """
    )

if __name__ == "__main__":
    # Set a Hugging Face token if needed (for gated models)
    # os.environ["HUGGINGFACE_HUB_TOKEN"] = "your_token_here"

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )