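"""Gradio chat app for the Quantum Keek 7B GGUF model, served on CPU with llama-cpp-python."""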
import gradio as gr
from llama_cpp import Llama
import time
import os
# Configuration
MODEL_REPO = "kainatq/quantum-keek-7b-Q4_K_M-GGUF"
MODEL_FILE = "quantum-keek-7b-q4_k_m.gguf"
MODEL_PATH = f"./{MODEL_FILE}"
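# Note: a 7B model quantized to Q4_K_M is roughly 4-5 GB on disk, so the first
# launch has to download the GGUF into the working directory before the UI starts.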
# Initialize the model
def load_model():
    try:
        # Download model if not exists
        if not os.path.exists(MODEL_PATH):
            print("Downloading model... This may take a while.")
            from huggingface_hub import hf_hub_download
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=".",
                local_dir_use_symlinks=False
            )

        # Initialize Llama with CPU optimization
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,        # Context window
            n_threads=2,       # Use both vCPUs
            n_batch=512,
            use_mlock=False,   # Don't lock memory (limited RAM)
            use_mmap=True,     # Use memory mapping
            verbose=False
        )
        print("Model loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load the model
llm = load_model()
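# The model is loaded once at import time, before the interface is built, so
# startup is slower but every request reuses the same Llama instance.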
def chat_with_ai(message, history, system_prompt, temperature, max_tokens):
    """
    Handle one chat turn: build a prompt from the history, run the model,
    and return the updated history for the Chatbot component.
    """
    if llm is None:
        return history + [(message, "Error: Model not loaded. Please check the console for details.")]

    # Prepare conversation history
    conversation = []

    # Add system prompt
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    # Add prior turns
    for human, assistant in history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])

    # Add current message
    conversation.append({"role": "user", "content": message})

    try:
        # Create a plain-text prompt from the conversation
        prompt = ""
        for msg in conversation:
            if msg["role"] == "system":
                prompt += f"System: {msg['content']}\n\n"
            elif msg["role"] == "user":
                prompt += f"User: {msg['content']}\n\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n\n"
        prompt += "Assistant:"

        # Generate response
        start_time = time.time()
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop=["User:", "System:"],
            echo=False,
            stream=False
        )
        generation_time = time.time() - start_time
        answer = response['choices'][0]['text'].strip()

        # Add generation info
        tokens_used = response['usage']['total_tokens']
        answer += f"\n\n---\n*Generated in {generation_time:.2f}s using {tokens_used} tokens*"

        return history + [(message, answer)]
    except Exception as e:
        return history + [(message, f"Error generating response: {str(e)}")]
def clear_chat():
    """Clear the chat history and the input box."""
    return [], ""
# Custom CSS for ChatGPT-like styling
custom_css = """
#chatbot {
    min-height: 400px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    background: #f9f9f9;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark #chatbot {
    background: #1e1e1e;
    border-color: #444;
}
"""
# Create the Gradio interface
with gr.Blocks(
    title="Quantum Keek Chat",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown(
        """
        # Quantum Keek Chat
        *Powered by Quantum Keek 7B GGUF - Running on CPU with llama.cpp*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are Quantum Keek, a helpful AI assistant. Provide detailed, thoughtful responses to user queries.",
                lines=3,
                placeholder="Enter system instructions..."
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values = more creative, lower values = more focused"
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of response"
            )

            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ---
                **Model Info:**
                - **Model:** Quantum Keek 7B Q4_K_M
                - **Platform:** CPU (llama.cpp)
                - **Context:** 4096 tokens
                """
            )
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Quantum Keek",
                elem_id="chatbot",
                height=500,
                show_copy_button=True
            )
            msg = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                max_lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("🚀 Send", variant="primary")
                stop_btn = gr.Button("⏹️ Stop", variant="secondary")
    # Event handlers
    submit_event = msg.submit(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    submit_btn.click(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, msg]
    )

    # Stop button functionality
    def stop_generation():
        # This is a placeholder - a real implementation would need to handle streaming
        return "Generation stopped by user."

    stop_btn.click(
        fn=stop_generation,
        outputs=[msg]
    )
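    # Note: a fuller stop implementation could pass cancels=[submit_event] to
    # stop_btn.click() so Gradio aborts the queued generation; the placeholder
    # above only writes a notice into the input box.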
    gr.Markdown(
        """
        ---
        **Note:** This app runs on the Hugging Face Spaces free tier (2 vCPU, 16 GB RAM),
        so responses may take a few seconds to generate.
        """
    )
if __name__ == "__main__":
    # Set huggingface token if needed (for gated models)
    # os.environ["HUGGINGFACE_HUB_TOKEN"] = "your_token_here"
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )