# app.py - Hugging Face Space App (PDF Summarizer & QnA)
"""Gradio app for a Hugging Face Space: upload a PDF, build a FAISS index over
its text chunks, then either summarize the document or answer questions
grounded in retrieved chunks (RAG). The LLM and the embedding model are never
resident on the GPU at the same time (see ModelManager)."""
import os
import gc
import tempfile
from typing import Tuple, Dict, Any, Optional

import gradio as gr
import torch
import numpy as np
import faiss
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Avoid tokenizer parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ------------------ CONFIG ------------------ #
LLM_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_PROMPT_LENGTH = 28000  # Max characters to feed into the LLM

# ------------------ PROMPT TEMPLATES ------------------ #
QA_PROMPT_TEMPLATE = (
    "System: You are a helpful assistant. Answer the user's question based *only* on the provided context. "
    "If the answer is not found in the context, state that clearly.\n\n"
    "Context:\n---\n{context}\n---\n\nQuestion: {question}\n\nAnswer:"
)

SUMMARY_PROMPTS = {
    "Quick": (
        "You are an expert academic summarizer. Provide a single, concise paragraph that summarizes the absolute key takeaway of the following document. "
        "Be brief and direct.\n\nDocument:\n---\n{text}\n---\n\nQuick Summary:"
    ),
    "Standard": (
        "You are an expert academic summarizer. Provide a detailed, well-structured summary of the following document. "
        "Cover the key points, methodology, findings, and conclusions.\n\n"
        "Document:\n---\n{text}\n---\n\nStandard Summary:"
    ),
    "Detailed": (
        "You are an expert academic summarizer. Provide a highly detailed and comprehensive summary of the following document. "
        "Go into depth on the methodology, specific results, limitations, and any mention of future work. Use multiple paragraphs for structure.\n\n"
        "Document:\n---\n{text}\n---\n\nDetailed Summary:"
    ),
}


# ------------------ MEMORY & MODEL MANAGEMENT ------------------ #
class ModelManager:
    """Keeps at most ONE heavy model (LLM *or* embedder) in memory at a time.

    Both getters first tear down whatever is loaded, so the 7B LLM and the
    large embedding model never compete for GPU memory on a Space.
    """

    # Cached singletons; None means "not currently loaded".
    _llm_pipe = None
    _embed_model = None

    @classmethod
    def _clear_gpu_memory(cls):
        """Drops all cached model references and clears the CUDA cache.

        NOTE: the original version looped ``del model`` over a local list,
        which only deleted the local binding (a no-op). Reassigning the class
        attributes to None is what actually releases the references.
        """
        cls._llm_pipe = None
        cls._embed_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("[Memory] GPU Memory Cleared.")

    @classmethod
    def get_llm_pipeline(cls):
        """Loads and returns the LLM pipeline, ensuring no other models are loaded.

        Returns None (instead of raising) when loading fails, so callers can
        surface a friendly error string to the UI.
        """
        if cls._llm_pipe is None:
            cls._clear_gpu_memory()
            print("[LLM] Loading model...")
            try:
                tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
                model = AutoModelForCausalLM.from_pretrained(
                    LLM_MODEL_NAME,
                    device_map=DEVICE,
                    torch_dtype=torch.bfloat16,
                )
                cls._llm_pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1024,
                    # do_sample=True is required for temperature/top_p to take
                    # effect; without it transformers runs greedy decoding and
                    # ignores (with a warning) the sampling parameters below.
                    do_sample=True,
                    temperature=0.2,
                    top_p=0.95,
                )
                print("[LLM] Model loaded successfully.")
            except Exception as e:
                print(f"[LLM] Failed to load model: {e}")
                return None
        return cls._llm_pipe

    @classmethod
    def get_embedding_model(cls):
        """Loads and returns the embedding model, ensuring the LLM is not loaded.

        Returns None on failure. Embeddings are L2-normalized so that the
        FAISS L2 index behaves like cosine-similarity search.
        """
        # import locally to avoid import-time cost if not needed
        from langchain_huggingface import HuggingFaceEmbeddings

        if cls._embed_model is None:
            cls._clear_gpu_memory()
            print("[Embed] Loading embedding model...")
            try:
                cls._embed_model = HuggingFaceEmbeddings(
                    model_name=EMBED_MODEL_NAME,
                    model_kwargs={"device": DEVICE},
                    encode_kwargs={"normalize_embeddings": True},
                )
                print("[Embed] Embedding model loaded successfully.")
            except Exception as e:
                print(f"[Embed] Failed to load model: {e}")
                return None
        return cls._embed_model


# ------------------ CORE LOGIC FUNCTIONS ------------------ #
@spaces.GPU
def invoke_llm(prompt_str: str) -> str:
    """Invokes the LLM with a given prompt.

    Truncates the prompt to MAX_PROMPT_LENGTH characters, strips the echoed
    prompt from the generation, and returns either the model's answer or a
    human-readable error string (this function never raises).
    """
    if len(prompt_str) > MAX_PROMPT_LENGTH:
        prompt_str = prompt_str[:MAX_PROMPT_LENGTH]
        print(f"[invoke_llm] Prompt truncated to {MAX_PROMPT_LENGTH} characters.")
    try:
        pipe = ModelManager.get_llm_pipeline()
        if not pipe:
            return "Error: LLM could not be loaded."
        with torch.no_grad():
            outputs = pipe(prompt_str)
        if isinstance(outputs, list) and outputs and "generated_text" in outputs[0]:
            # Text-generation pipelines echo the prompt at the start of the
            # output; removeprefix strips exactly that leading copy (unlike
            # str.replace, which would delete every occurrence anywhere).
            return outputs[0]["generated_text"].removeprefix(prompt_str).strip()
        return "No valid response was generated."
    except Exception as e:
        print(f"[invoke_llm] Error: {e}")
        return f"LLM invocation failed: {e}"


@spaces.GPU
def process_pdf_and_index(pdf_path: str) -> Tuple[str, Optional[Dict[str, Any]]]:
    """Processes a PDF, creates embeddings, and builds a FAISS index.

    Returns (status message, state bundle). The bundle holds the on-disk
    FAISS index path plus the raw chunk texts, and is stored in gr.State so
    later queries can reload the index without re-embedding.
    """
    from langchain_community.document_loaders import PyMuPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    if not pdf_path:
        return "No file path provided.", None
    try:
        print("[Process] Loading and splitting PDF...")
        docs = PyMuPDFLoader(pdf_path).load()
        chunks = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=150
        ).split_documents(docs)
        texts = [c.page_content for c in chunks if c.page_content.strip()]
        if not texts:
            return "No text could be extracted from the PDF.", None
        print(f"[Process] Extracted {len(texts)} text chunks.")

        embed_model = ModelManager.get_embedding_model()
        if not embed_model:
            return "Could not load embedding model.", None

        print(f"[Process] Creating embeddings...")
        embeddings = embed_model.embed_documents(texts)
        emb_np = np.array(embeddings, dtype=np.float32)

        print("[Process] Building and saving FAISS index...")
        index = faiss.IndexFlatL2(emb_np.shape[1])
        index.add(emb_np)
        # Persist the index to a temp file; only the path goes into gr.State.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".faiss") as f:
            index_path = f.name
        faiss.write_index(index, index_path)

        state_bundle = {"index_path": index_path, "texts": texts}
        return f"Successfully processed and indexed {len(texts)} chunks.", state_bundle
    except Exception as e:
        print(f"[process_pdf] Exception: {e}")
        return f"Error processing PDF: {e}", None


@spaces.GPU
def retrieve_and_answer(question: str, state_bundle: Dict[str, Any]) -> Tuple[str, str]:
    """Retrieves context and generates an answer for a given question.

    Embeds the question, pulls the top-5 nearest chunks from the FAISS index,
    and asks the LLM to answer using only that context. Returns
    (answer, sources preview); both elements are plain strings for the UI.
    """
    if not (state_bundle and "index_path" in state_bundle):
        return "Please upload and process a PDF first.", ""
    try:
        embed_model = ModelManager.get_embedding_model()
        if not embed_model:
            return "Error loading embedding model.", ""

        index = faiss.read_index(state_bundle["index_path"])
        texts = state_bundle.get("texts", [])

        query_embedding = embed_model.embed_query(question)
        q_arr = np.array([query_embedding], dtype=np.float32)
        _, indices = index.search(q_arr, k=5)
        # Guard against FAISS returning -1 (not enough vectors) or stale ids.
        sources = [texts[idx] for idx in indices[0] if 0 <= idx < len(texts)]
        if not sources:
            return "Could not find relevant information.", ""

        context = "\n\n---\n\n".join(sources)
        # Only add an ellipsis when the preview is actually truncated.
        sources_preview = "\n\n---\n\n".join(
            s[:500] + "..." if len(s) > 500 else s for s in sources
        )
        prompt = QA_PROMPT_TEMPLATE.format(context=context, question=question)
        answer = invoke_llm(prompt)
        return answer, sources_preview
    except Exception as e:
        print(f"[retrieve_and_answer] Error: {e}")
        return f"An error occurred: {e}", ""


@spaces.GPU
def summarize_document(state_bundle: Dict[str, Any], summary_type: str) -> Tuple[str, Optional[str]]:
    """Generates a summary of the document and saves it to a temporary file.

    summary_type selects one of SUMMARY_PROMPTS ("Quick" / "Standard" /
    "Detailed"); unknown values fall back to "Standard". Returns
    (summary text, path to a .txt file containing it, or None on failure).
    """
    if not (state_bundle and "texts" in state_bundle):
        return "Please upload and process a PDF first.", None
    texts = state_bundle.get("texts", [])
    if not texts:
        return "No text available to summarize.", None

    full_text = "\n\n".join(texts)
    prompt_template = SUMMARY_PROMPTS.get(summary_type, SUMMARY_PROMPTS["Standard"])
    prompt = prompt_template.format(text=full_text)

    print(f"[Summarize] Generating '{summary_type}' summary...")
    final_summary = invoke_llm(prompt)

    # Save the summary to a temporary text file for the download button.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as temp_file:
        temp_file.write(final_summary)
        summary_path = temp_file.name
    return final_summary, summary_path


# ------------------ GRADIO UI ------------------ #
with gr.Blocks(title="PDF Summarizer & Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PDF Summarizer & Q&A Assistant")
    gr.Markdown("Upload a PDF to generate a summary or ask questions about its content.")

    state = gr.State()

    with gr.Row():
        pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
        process_btn = gr.Button("Process PDF", variant="primary")
    status_output = gr.Textbox(label="Status", interactive=False)

    with gr.Tabs():
        with gr.TabItem("Summarization"):
            gr.Markdown("### Generate a Summary")
            gr.Markdown("Select the level of detail you want in the summary.")
            summary_type_radio = gr.Radio(
                choices=["Quick", "Standard", "Detailed"],
                value="Standard",
                label="Summary Type",
            )
            summary_btn = gr.Button("Generate Summary", variant="secondary")
            out_summary = gr.Textbox(label="Document Summary", lines=20, max_lines=25)
            download_btn = gr.DownloadButton("Download Summary", visible=False)

        with gr.TabItem("Question & Answer"):
            gr.Markdown("### Ask a Question")
            gr.Markdown("Ask a specific question about the document's content.")
            q_text = gr.Textbox(
                label="Your Question",
                placeholder="e.g., What was the main conclusion of the study?",
            )
            q_btn = gr.Button("Get Answer", variant="secondary")
            q_out = gr.Textbox(label="Answer", lines=8)
            q_sources = gr.Textbox(label="Retrieved Sources", lines=8, max_lines=10)

    # Event Handlers - wrappers return exactly the outputs wired to the UI
    def handle_process(pdf_file):
        """Wrapper to handle PDF processing and clear old outputs.

        BUG FIX: gr.File(type="filepath") passes a plain string path, which
        has no .name attribute — the original `pdf_file.name` raised
        AttributeError on every upload. We accept both the string form and
        the legacy tempfile-wrapper form for safety.
        """
        hidden_download = gr.DownloadButton(visible=False)
        if pdf_file is None:
            return "Please upload a file first.", None, "", "", "", "", hidden_download
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        status_msg, bundle = process_pdf_and_index(path)
        # return: status, state, out_summary, q_text, q_out, q_sources, download_btn
        return status_msg, bundle, "", "", "", "", hidden_download

    def handle_summarize(bundle, summary_type):
        """Wrapper that reveals the download button once a summary file exists.

        BUG FIX: the download button was created visible=False and nothing
        ever made it visible, so downloads were impossible.
        """
        summary, path = summarize_document(bundle, summary_type)
        if path:
            return summary, gr.DownloadButton(value=path, visible=True)
        return summary, gr.DownloadButton(visible=False)

    process_btn.click(
        fn=handle_process,
        inputs=[pdf_in],
        outputs=[status_output, state, out_summary, q_text, q_out, q_sources, download_btn],
    )

    q_btn.click(
        fn=retrieve_and_answer,
        inputs=[q_text, state],
        outputs=[q_out, q_sources],
    )

    summary_btn.click(
        fn=handle_summarize,
        inputs=[state, summary_type_radio],
        outputs=[out_summary, download_btn],
    )

if __name__ == "__main__":
    demo.launch(share=False, show_error=True)