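"""Gradio chat app for the Quantum Keek 7B GGUF model, served on CPU with llama-cpp-python."""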
import gradio as gr
from llama_cpp import Llama
import time
import os
# Configuration
MODEL_REPO = "kainatq/quantum-keek-7b-Q4_K_M-GGUF"
MODEL_FILE = "quantum-keek-7b-q4_k_m.gguf"
MODEL_PATH = f"./{MODEL_FILE}"
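# Note: a 7B model quantized to Q4_K_M is roughly 4-5 GB on disk, so the first
# launch has to download the GGUF into the working directory before the UI starts.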
# Initialize the model
def load_model():
    try:
        # Download model if not exists
        if not os.path.exists(MODEL_PATH):
            print("Downloading model... This may take a while.")
            from huggingface_hub import hf_hub_download
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=".",
                local_dir_use_symlinks=False
            )

        # Initialize Llama with CPU optimization
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,        # Context window
            n_threads=2,       # Use both vCPUs
            n_batch=512,
            use_mlock=False,   # Don't lock memory (limited RAM)
            use_mmap=True,     # Use memory mapping
            verbose=False
        )
        print("Model loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load the model
llm = load_model()
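# The model is loaded once at import time, before the interface is built, so
# startup is slower but every request reuses the same Llama instance.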
def chat_with_ai(message, history, system_prompt, temperature, max_tokens):
    """
    Handle one chat turn: build a prompt from the history, run the model,
    and return the updated history for the Chatbot component.
    """
    if llm is None:
        return history + [(message, "Error: Model not loaded. Please check the console for details.")]

    # Prepare conversation history
    conversation = []

    # Add system prompt
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    # Add prior turns
    for human, assistant in history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])

    # Add current message
    conversation.append({"role": "user", "content": message})

    try:
        # Create a plain-text prompt from the conversation
        prompt = ""
        for msg in conversation:
            if msg["role"] == "system":
                prompt += f"System: {msg['content']}\n\n"
            elif msg["role"] == "user":
                prompt += f"User: {msg['content']}\n\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n\n"
        prompt += "Assistant:"

        # Generate response
        start_time = time.time()
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stop=["User:", "System:"],
            echo=False,
            stream=False
        )
        generation_time = time.time() - start_time
        answer = response['choices'][0]['text'].strip()

        # Add generation info
        tokens_used = response['usage']['total_tokens']
        answer += f"\n\n---\n*Generated in {generation_time:.2f}s using {tokens_used} tokens*"

        return history + [(message, answer)]
    except Exception as e:
        return history + [(message, f"Error generating response: {str(e)}")]
def clear_chat():
    """Clear the chat history and the input box."""
    return [], ""
# Custom CSS for ChatGPT-like styling
custom_css = """
#chatbot {
    min-height: 400px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    background: #f9f9f9;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark #chatbot {
    background: #1e1e1e;
    border-color: #444;
}
"""
# Create the Gradio interface
with gr.Blocks(
    title="Quantum Keek Chat",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown(
        """
        # Quantum Keek Chat
        *Powered by Quantum Keek 7B GGUF - Running on CPU with llama.cpp*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are Quantum Keek, a helpful AI assistant. Provide detailed, thoughtful responses to user queries.",
                lines=3,
                placeholder="Enter system instructions..."
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values = more creative, lower values = more focused"
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of response"
            )

            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ---
                **Model Info:**
                - **Model:** Quantum Keek 7B Q4_K_M
                - **Platform:** CPU (llama.cpp)
                - **Context:** 4096 tokens
                """
            )
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Quantum Keek",
                elem_id="chatbot",
                height=500,
                show_copy_button=True
            )
            msg = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                max_lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("🚀 Send", variant="primary")
                stop_btn = gr.Button("⏹️ Stop", variant="secondary")
    # Event handlers
    submit_event = msg.submit(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    submit_btn.click(
        fn=chat_with_ai,
        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
        outputs=[chatbot]
    ).then(
        lambda: "",  # Clear input
        outputs=[msg]
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, msg]
    )

    # Stop button functionality
    def stop_generation():
        # This is a placeholder - a real implementation would need to handle streaming
        return "Generation stopped by user."

    stop_btn.click(
        fn=stop_generation,
        outputs=[msg]
    )
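    # Note: a fuller stop implementation could pass cancels=[submit_event] to
    # stop_btn.click() so Gradio aborts the queued generation; the placeholder
    # above only writes a notice into the input box.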
    gr.Markdown(
        """
        ---
        **Note:** This app runs on the Hugging Face Spaces free tier (2 vCPU, 16 GB RAM),
        so responses may take a few seconds to generate.
        """
    )
if __name__ == "__main__":
    # Set huggingface token if needed (for gated models)
    # os.environ["HUGGINGFACE_HUB_TOKEN"] = "your_token_here"
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )