Spaces:
Build error
import gradio as gr
from llama_cpp import Llama
import time
import os

# Configuration
MODEL_REPO = "kainatq/quantum-keek-7b-Q4_K_M-GGUF"
MODEL_FILE = "quantum-keek-7b-q4_k_m.gguf"
MODEL_PATH = f"./{MODEL_FILE}"
# Initialize the model
def load_model():
    try:
        # Download the model if it is not already present
        if not os.path.exists(MODEL_PATH):
            print("Downloading model... This may take a while.")
            from huggingface_hub import hf_hub_download
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=".",
                local_dir_use_symlinks=False
            )

        # Initialize Llama with CPU-friendly settings
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,       # Context window
            n_threads=2,      # Use both vCPUs
            n_batch=512,
            use_mlock=False,  # Don't lock memory (limited RAM)
            use_mmap=True,    # Use memory mapping
            verbose=False
        )
        print("Model loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load the model
llm = load_model()
def chat_with_ai(message, history, system_prompt, temperature, max_tokens):
    """
    Handle one chat turn with the AI model and return the updated history
    in the (user, assistant) tuple format expected by gr.Chatbot.
    """
    history = history or []

    if llm is None:
        history.append((message, "Error: Model not loaded. Please check the console for details."))
        return history

    # Prepare conversation history
    conversation = []

    # Add system prompt
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    # Add previous turns
    for human, assistant in history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])

    # Add current message
    conversation.append({"role": "user", "content": message})

    try:
        # Build a plain-text prompt from the conversation
        prompt = ""
        for msg in conversation:
            if msg["role"] == "system":
                prompt += f"System: {msg['content']}\n\n"
            elif msg["role"] == "user":
                prompt += f"User: {msg['content']}\n\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n\n"
        prompt += "Assistant:"

        # Generate response
        start_time = time.time()
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop=["User:", "System:"],
            echo=False,
            stream=False
        )
        generation_time = time.time() - start_time

        answer = response['choices'][0]['text'].strip()

        # Add generation info
        tokens_used = response['usage']['total_tokens']
        answer += f"\n\n---\n*Generated in {generation_time:.2f}s using {tokens_used} tokens*"

        history.append((message, answer))
        return history
    except Exception as e:
        history.append((message, f"Error generating response: {str(e)}"))
        return history

def clear_chat():
    """Clear the chat history and the input box"""
    return [], ""
# Custom CSS for ChatGPT-like styling
custom_css = """
#chatbot {
    min-height: 400px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    background: #f9f9f9;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark #chatbot {
    background: #1e1e1e;
    border-color: #444;
}
"""
# Create the Gradio interface
with gr.Blocks(
    title="🪐 Quantum Keek Chat",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown(
        """
        # 🪐 Quantum Keek Chat
        *Powered by Quantum Keek 7B GGUF - Running on CPU with llama.cpp*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are Quantum Keek, a helpful AI assistant. Provide detailed, thoughtful responses to user queries.",
                lines=3,
                placeholder="Enter system instructions..."
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values = more creative, lower values = more focused"
            )

            max_tokens = gr.Slider(
                minimum=100,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of the response"
            )

            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ---
                **Model Info:**
                - **Model:** Quantum Keek 7B Q4_K_M
                - **Platform:** CPU (llama.cpp)
                - **Context:** 4096 tokens
                """
            )
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="🪐 Quantum Keek",
                elem_id="chatbot",
                height=500,
                show_copy_button=True
            )

            msg = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                max_lines=5
            )

            with gr.Row():
                submit_btn = gr.Button("🚀 Send", variant="primary")
                stop_btn = gr.Button("⏹️ Stop", variant="secondary")
    # Event handlers
    submit_event = msg.submit(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    submit_btn.click(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, msg]
    )

    # Stop button functionality
    def stop_generation():
        # This is a placeholder - in a real implementation you'd need to handle streaming
        return "Generation stopped by user."

    stop_btn.click(
        fn=stop_generation,
        outputs=[msg]
    )
    gr.Markdown(
        """
        ---
        **Note:** This is running on the Hugging Face Spaces free tier (2 vCPU, 16 GB RAM).
        Responses may take a few seconds to generate.
        """
    )

if __name__ == "__main__":
    # Set the Hugging Face token if needed (for gated models)
    # os.environ["HUGGINGFACE_HUB_TOKEN"] = "your_token_here"

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
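
The build log itself isn't shown above, so the exact cause of the build error is an assumption, but a common one on Spaces is a missing or incomplete requirements.txt. A minimal one for this script would list the three packages it imports:

# requirements.txt (sketch, assuming the build error comes from missing dependencies)
gradio
llama-cpp-python
huggingface_hub

Note that llama-cpp-python may compile llama.cpp from source during the Space build, which can take several minutes on the free CPU tier.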