# build_index.py
import os

import requests
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"

def fetch_html_with_timeout(url: str, timeout=5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it with UnstructuredHTMLLoader.
    Returns a list of Documents (one or several, depending on how the page is split).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so it can be loaded with UnstructuredHTMLLoader.
    # (unstructured can parse in-memory content too, but a file keeps this simple.)
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    loader = UnstructuredHTMLLoader(temp_filename)
    docs = loader.load()  # returns a list of Document objects

    # Tag each Document with the original URL instead of the temp file path.
    for doc in docs:
        doc.metadata["source"] = url

    os.remove(temp_filename)  # clean up the temporary file
    return docs
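
# A hedged alternative to the temporary file above: the HTML could be parsed
# in memory instead, e.g. with BeautifulSoup (this assumes the bs4 package is
# installed; it yields plainer text than unstructured's element-aware parsing).
# The sketch below would replace the temp-file block inside fetch_html_with_timeout:
#
#   from bs4 import BeautifulSoup
#   text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
#   return [Document(page_content=text, metadata={"source": url})]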

def load_web_docs(urls: list[str], timeout=5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs

def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # Debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print(" -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f" -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDF files
        elif file_name.lower().endswith(".pdf"):
            print(" -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f" -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists (one URL per line)
        elif file_name.lower().endswith(".urls"):
            print(" -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f" -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f" -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print(" -> Skipped: unrecognized file type.")

    return all_docs

def build_faiss_index():
    documents = load_documents()

    # Split documents into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    # Embeddings run on GPU here; change "device" to "cpu" if no CUDA GPU is available.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda"},
    )

    vectorstore = FAISS.from_documents(splitted_docs, embeddings)
    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")

if __name__ == "__main__":
    build_faiss_index()