Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 from flask import Flask, request, render_template, send_file, redirect, url_for, jsonify
 import os
 import re
@@ -7,7 +8,7 @@ import faiss
 from sentence_transformers import SentenceTransformer
 from PyPDF2 import PdfReader
 
-# Optional
+# Optional NLI (not required for this feature)
 try:
     from transformers import pipeline as hf_pipeline
     nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
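
Note on the optional import: a minimal sketch of the full guard pattern this hunk relies on, assuming the except branch (outside this hunk) falls back to nli = None so the app still runs when transformers or the model weights are unavailable.

    try:
        from transformers import pipeline as hf_pipeline
        nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
    except Exception:  # ImportError, missing weights, offline, ...
        nli = None     # callers must check `nli is not None` before using it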
@@ -47,8 +48,11 @@ def clear_uploads_folder():
 clear_uploads_folder()
 print("✅ Uploads folder cleared.")
 
-# runtime cache keyed by search-id
-#
+# runtime cache keyed by search-id:
+# (paragraphs_norm, embeddings, faiss-index, spans, para_file_idx, file_meta)
+# spans[i] = (start_char, end_char) of paragraph i within merged.txt
+# para_file_idx[i] = index into file_meta for paragraph i
+# file_meta[j] = {"name": filename, "start": start_char, "end": end_char, "second_line": str}
 index_data = {}
 
 # ── helpers ────────────────────────────────────────────────────────────────────
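
To make the new six-slot cache layout concrete, here is an illustrative entry with tiny made-up values (the real embeddings and index come from rebuild_merged_and_index below):

    import numpy as np, faiss

    paras = ["alpha paragraph", "beta paragraph"]
    emb = np.eye(2, 384, dtype="float32")            # stand-in embeddings
    ix = faiss.IndexFlatIP(emb.shape[1])
    ix.add(emb)
    index_data["demo-sid"] = (
        paras,                                       # paragraphs_norm
        emb,                                         # embeddings
        ix,                                          # faiss-index
        [(0, 15), (17, 31)],                         # spans into merged.txt
        [0, 1],                                      # para_file_idx
        [{"name": "a.txt", "start": 0, "end": 15, "second_line": ""},
         {"name": "b.txt", "start": 17, "end": 31, "second_line": ""}],
    )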
@@ -62,22 +66,45 @@ def get_paths(sid: str):
     result_file = os.path.join(res_folder, "results.txt")
     return up_folder, res_folder, merged_file, result_file
 
-def
-    if
-
-
-    if
+def compute_second_line(raw_text: str) -> str:
+    """Return the 2nd non-empty line if available, else the literal 2nd line, else ''."""
+    lines = raw_text.splitlines()
+    non_empty = [ln.strip() for ln in lines if ln.strip() != ""]
+    if len(non_empty) >= 2:
+        return non_empty[1]
+    if len(lines) >= 2:
+        return lines[1].strip()
+    return ""
+
+def extract_for_merge_and_second(file_path: str):
+    """
+    Return a tuple (merged_text_piece, second_line_str) for a single file.
+    - For .txt: merged part is raw file text.
+    - For .pdf: merged part is lightly cleaned text; second line is computed
+      using raw extracted line structure as well.
+    """
+    if file_path.lower().endswith(".txt"):
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            raw = f.read()
+        second = compute_second_line(raw)
+        return raw, second
+
+    if file_path.lower().endswith(".pdf"):
         reader = PdfReader(file_path)
-
+        pages = []
         for page in reader.pages:
             t = page.extract_text() or ""
-
-
-
+            pages.append(t)
+        raw_lines_joined = "\n".join(pages)  # preserve some line structure for 2nd line
+        second = compute_second_line(raw_lines_joined)
+
+        # Light cleanup for the merged view
+        full_text = " ".join(pages)
         full_text = re.sub(r'(?<=[.!?])\s{2,}', '\n\n', full_text)
         full_text = re.sub(r'(\n\s*){2,}', '\n\n', full_text)
-        return full_text
-
+        return full_text, second
+
+    return "", ""
 
 def split_paragraphs_with_spans(merged_text: str):
     """
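
Quick usage sketch for the two new helpers (the file name is hypothetical):

    text, second = extract_for_merge_and_second("uploads/paper.pdf")
    # `text` is the lightly cleaned body destined for merged.txt,
    # `second` is that file's 2nd line (e.g. a subtitle or author row)

    compute_second_line("Title\n\nAuthors\nAbstract ...")  # -> "Authors"
    compute_second_line("single line")                     # -> ""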
@@ -105,23 +132,47 @@ def split_paragraphs_with_spans(merged_text: str):
     return paras_norm, spans
 
 def rebuild_merged_and_index(sid: str):
-    """Build merged.txt, paragraph embeddings,
+    """Build merged.txt, paragraph embeddings, spans, and per-paragraph file mapping."""
     up_folder, res_folder, merged_file, _ = get_paths(sid)
 
     merged_text = ""
+    file_meta = []  # list of dicts with name, start, end, second_line
+    # Append files in sorted order for stability
     for filename in sorted(os.listdir(up_folder)):
-        if filename.lower().endswith((".pdf", ".txt")):
-
+        if not filename.lower().endswith((".pdf", ".txt")):
+            continue
+        file_path = os.path.join(up_folder, filename)
+        part, second = extract_for_merge_and_second(file_path)
+        part = part.rstrip()
+        if not part:
+            continue
+        start = len(merged_text)
+        merged_text += part + "\n\n"  # add separator after each file
+        end = start + len(part)  # char range covering the file's text (exclude our extra \n\n)
+        file_meta.append({"name": filename, "start": start, "end": end, "second_line": second})
 
     with open(merged_file, "w", encoding='utf-8') as f:
         f.write(merged_text)
 
     paras_norm, spans = split_paragraphs_with_spans(merged_text)
-
     if not paras_norm:
-        index_data[sid] = ([], None, None, [])
+        index_data[sid] = ([], None, None, [], [], [])
         return
 
+    # Map each paragraph span to its originating file via start position
+    para_file_idx = []
+    for (pstart, _pend) in spans:
+        assigned = None
+        for j, meta in enumerate(file_meta):
+            next_start = file_meta[j+1]["start"] if j + 1 < len(file_meta) else float("inf")
+            if pstart >= meta["start"] and pstart < next_start:
+                assigned = j
+                break
+        if assigned is None:
+            assigned = max(0, len(file_meta)-1)
+        para_file_idx.append(assigned)
+
+    # Build embeddings + FAISS
    embed = model.encode(paras_norm, batch_size=32, show_progress_bar=False)
     embed = np.asarray(embed)
     if embed.ndim == 1:
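
The linear scan that assigns each paragraph to a file is fine for a handful of uploads; the same mapping can be written with the standard bisect module. A sketch, assuming file_meta stays ordered by "start" (it does, since files are appended in sorted order):

    import bisect

    starts = [meta["start"] for meta in file_meta]
    para_file_idx = [
        max(0, bisect.bisect_right(starts, pstart) - 1)  # last file whose start <= pstart
        for (pstart, _pend) in spans
    ]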
@@ -130,7 +181,7 @@ def rebuild_merged_and_index(sid: str):
     idx = faiss.IndexFlatIP(embed.shape[1])
     idx.add(embed)
 
-    index_data[sid] = (paras_norm, embed, idx, spans)
+    index_data[sid] = (paras_norm, embed, idx, spans, para_file_idx, file_meta)
 
 # ── routes ─────────────────────────────────────────────────────────────────────
 @app.route("/", methods=["GET", "POST"])
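
Why IndexFlatIP is the right index here: after faiss.normalize_L2, inner product equals cosine similarity. A self-contained sketch with toy vectors (not the app's sentence-transformer output):

    import numpy as np, faiss

    vecs = np.random.rand(4, 8).astype("float32")
    faiss.normalize_L2(vecs)                  # in-place row normalization
    ix = faiss.IndexFlatIP(vecs.shape[1])
    ix.add(vecs)
    D, I = ix.search(vecs[:1].copy(), k=2)    # D[0][0] ~ 1.0: a vector matches itself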
@@ -140,7 +191,9 @@ def index():
     sid = str(uuid.uuid4())
 
     up_folder, _, _, _ = get_paths(sid)
-    paragraphs, embeddings, index_faiss, spans = index_data.get(sid, ([], None, None, []))
+    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
+        sid, ([], None, None, [], [], [])
+    )
 
     uploaded_filenames = sorted(os.listdir(up_folder))
 
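
The six-slot default in index_data.get means a brand-new sid unpacks cleanly instead of raising KeyError, and every slot is falsy so the route can guard on it:

    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
        "no-such-sid", ([], None, None, [], [], [])
    )
    assert paragraphs == [] and index_faiss is None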
@@ -163,8 +216,18 @@ def index():
     faiss.normalize_L2(q_embed)
     D, I = index_faiss.search(q_embed, k=min(k, len(paragraphs)))
 
-    #
-
+    # Build result objects with file name + the file's 2nd line
+    for i in I[0]:
+        i = int(i)
+        file_idx = para_file_idx[i] if 0 <= i < len(para_file_idx) else -1
+        fname = file_meta[file_idx]["name"] if 0 <= file_idx < len(file_meta) else "unknown"
+        second = file_meta[file_idx]["second_line"] if 0 <= file_idx < len(file_meta) else ""
+        results.append({
+            "idx": i,
+            "text": paragraphs[i],
+            "file": fname,
+            "second_line": second
+        })
 
     _, res_folder, _, result_file = get_paths(sid)
     with open(result_file, "w", encoding='utf-8') as f:
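
For reference, one entry appended to results by the loop above looks like this (values illustrative):

    {
        "idx": 7,                         # row in the FAISS index / paragraphs list
        "text": "matched paragraph ...",  # the retrieved paragraph text
        "file": "notes.pdf",              # originating upload, via para_file_idx
        "second_line": "Dr. A. Author"    # that file's 2nd line, from file_meta
    }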
@@ -252,7 +315,7 @@ def api_context():
     except (TypeError, ValueError):
         return jsonify(error="Bad idx"), 400
 
-    paragraphs, _, _, spans = index_data.get(sid, (None, None, None, None))
+    paragraphs, _, _, spans, _, _ = index_data.get(sid, (None, None, None, None, None, None))
     if paragraphs is None or spans is None:
         return jsonify(error="No index for this sid. Upload files first."), 404
 
@@ -275,4 +338,4 @@ def api_context():
     )
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)
+    app.run(host="0.0.0.0", port=7860)