Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 from flask import Flask, request, render_template, send_file, redirect, url_for, jsonify
 import os
 import re
@@ -7,7 +8,7 @@ import faiss
 from sentence_transformers import SentenceTransformer
 from PyPDF2 import PdfReader
 
-# Optional
+# Optional NLI (not required for this feature)
 try:
     from transformers import pipeline as hf_pipeline
     nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
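
Note on the optional import: a minimal sketch of the full guard pattern this hunk relies on, assuming the except branch (outside this hunk) falls back to nli = None so the app still runs when transformers or the model weights are unavailable.

    try:
        from transformers import pipeline as hf_pipeline
        nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
    except Exception:  # ImportError, missing weights, offline, ...
        nli = None     # callers must check `nli is not None` before using it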
@@ -47,8 +48,11 @@ def clear_uploads_folder():
 clear_uploads_folder()
 print("✅ Uploads folder cleared.")
 
-# runtime cache keyed by search-id
-#
+# runtime cache keyed by search-id:
+# (paragraphs_norm, embeddings, faiss-index, spans, para_file_idx, file_meta)
+# spans[i] = (start_char, end_char) of paragraph i within merged.txt
+# para_file_idx[i] = index into file_meta for paragraph i
+# file_meta[j] = {"name": filename, "start": start_char, "end": end_char, "second_line": str}
 index_data = {}
 
 # ── helpers ────────────────────────────────────────────────────────────────────
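
To make the new six-slot cache layout concrete, here is an illustrative entry with tiny made-up values (the real embeddings and index come from rebuild_merged_and_index below):

    import numpy as np, faiss

    paras = ["alpha paragraph", "beta paragraph"]
    emb = np.eye(2, 384, dtype="float32")            # stand-in embeddings
    ix = faiss.IndexFlatIP(emb.shape[1])
    ix.add(emb)
    index_data["demo-sid"] = (
        paras,                                       # paragraphs_norm
        emb,                                         # embeddings
        ix,                                          # faiss-index
        [(0, 15), (17, 31)],                         # spans into merged.txt
        [0, 1],                                      # para_file_idx
        [{"name": "a.txt", "start": 0, "end": 15, "second_line": ""},
         {"name": "b.txt", "start": 17, "end": 31, "second_line": ""}],
    )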
@@ -62,22 +66,45 @@ def get_paths(sid: str):
     result_file = os.path.join(res_folder, "results.txt")
     return up_folder, res_folder, merged_file, result_file
 
-def
-    if
-
-
-    if
+def compute_second_line(raw_text: str) -> str:
+    """Return the 2nd non-empty line if available, else the literal 2nd line, else ''."""
+    lines = raw_text.splitlines()
+    non_empty = [ln.strip() for ln in lines if ln.strip() != ""]
+    if len(non_empty) >= 2:
+        return non_empty[1]
+    if len(lines) >= 2:
+        return lines[1].strip()
+    return ""
+
+def extract_for_merge_and_second(file_path: str):
+    """
+    Return a tuple (merged_text_piece, second_line_str) for a single file.
+    - For .txt: merged part is raw file text.
+    - For .pdf: merged part is lightly cleaned text; second line is computed
+      using raw extracted line structure as well.
+    """
+    if file_path.lower().endswith(".txt"):
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            raw = f.read()
+        second = compute_second_line(raw)
+        return raw, second
+
+    if file_path.lower().endswith(".pdf"):
         reader = PdfReader(file_path)
-
+        pages = []
         for page in reader.pages:
             t = page.extract_text() or ""
-
-
-
+            pages.append(t)
+        raw_lines_joined = "\n".join(pages)  # preserve some line structure for 2nd line
+        second = compute_second_line(raw_lines_joined)
+
+        # Light cleanup for the merged view
+        full_text = " ".join(pages)
         full_text = re.sub(r'(?<=[.!?])\s{2,}', '\n\n', full_text)
         full_text = re.sub(r'(\n\s*){2,}', '\n\n', full_text)
-        return full_text
-
+        return full_text, second
+
+    return "", ""
 
 def split_paragraphs_with_spans(merged_text: str):
     """
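
Quick usage sketch for the two new helpers (the file name is hypothetical):

    text, second = extract_for_merge_and_second("uploads/paper.pdf")
    # `text` is the lightly cleaned body destined for merged.txt,
    # `second` is that file's 2nd line (e.g. a subtitle or author row)

    compute_second_line("Title\n\nAuthors\nAbstract ...")  # -> "Authors"
    compute_second_line("single line")                     # -> ""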
@@ -105,23 +132,47 @@ def split_paragraphs_with_spans(merged_text: str):
     return paras_norm, spans
 
 def rebuild_merged_and_index(sid: str):
-    """Build merged.txt, paragraph embeddings,
+    """Build merged.txt, paragraph embeddings, spans, and per-paragraph file mapping."""
     up_folder, res_folder, merged_file, _ = get_paths(sid)
 
     merged_text = ""
+    file_meta = []  # list of dicts with name, start, end, second_line
+    # Append files in sorted order for stability
     for filename in sorted(os.listdir(up_folder)):
-        if filename.lower().endswith((".pdf", ".txt")):
-
+        if not filename.lower().endswith((".pdf", ".txt")):
+            continue
+        file_path = os.path.join(up_folder, filename)
+        part, second = extract_for_merge_and_second(file_path)
+        part = part.rstrip()
+        if not part:
+            continue
+        start = len(merged_text)
+        merged_text += part + "\n\n"  # add separator after each file
+        end = start + len(part)  # char range covering the file's text (exclude our extra \n\n)
+        file_meta.append({"name": filename, "start": start, "end": end, "second_line": second})
 
     with open(merged_file, "w", encoding='utf-8') as f:
         f.write(merged_text)
 
     paras_norm, spans = split_paragraphs_with_spans(merged_text)
-
     if not paras_norm:
-        index_data[sid] = ([], None, None, [])
+        index_data[sid] = ([], None, None, [], [], [])
         return
 
+    # Map each paragraph span to its originating file via start position
+    para_file_idx = []
+    for (pstart, _pend) in spans:
+        assigned = None
+        for j, meta in enumerate(file_meta):
+            next_start = file_meta[j+1]["start"] if j + 1 < len(file_meta) else float("inf")
+            if pstart >= meta["start"] and pstart < next_start:
+                assigned = j
+                break
+        if assigned is None:
+            assigned = max(0, len(file_meta)-1)
+        para_file_idx.append(assigned)
+
+    # Build embeddings + FAISS
    embed = model.encode(paras_norm, batch_size=32, show_progress_bar=False)
     embed = np.asarray(embed)
     if embed.ndim == 1:
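
The linear scan that assigns each paragraph to a file is fine for a handful of uploads; the same mapping can be written with the standard bisect module. A sketch, assuming file_meta stays ordered by "start" (it does, since files are appended in sorted order):

    import bisect

    starts = [meta["start"] for meta in file_meta]
    para_file_idx = [
        max(0, bisect.bisect_right(starts, pstart) - 1)  # last file whose start <= pstart
        for (pstart, _pend) in spans
    ]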
@@ -130,7 +181,7 @@ def rebuild_merged_and_index(sid: str):
     idx = faiss.IndexFlatIP(embed.shape[1])
     idx.add(embed)
 
-    index_data[sid] = (paras_norm, embed, idx, spans)
+    index_data[sid] = (paras_norm, embed, idx, spans, para_file_idx, file_meta)
 
 # ── routes ─────────────────────────────────────────────────────────────────────
 @app.route("/", methods=["GET", "POST"])
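
Why IndexFlatIP is the right index here: after faiss.normalize_L2, inner product equals cosine similarity. A self-contained sketch with toy vectors (not the app's sentence-transformer output):

    import numpy as np, faiss

    vecs = np.random.rand(4, 8).astype("float32")
    faiss.normalize_L2(vecs)                  # in-place row normalization
    ix = faiss.IndexFlatIP(vecs.shape[1])
    ix.add(vecs)
    D, I = ix.search(vecs[:1].copy(), k=2)    # D[0][0] ~ 1.0: a vector matches itself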
@@ -140,7 +191,9 @@ def index():
     sid = str(uuid.uuid4())
 
     up_folder, _, _, _ = get_paths(sid)
-    paragraphs, embeddings, index_faiss, spans = index_data.get(sid, ([], None, None, []))
+    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
+        sid, ([], None, None, [], [], [])
+    )
 
     uploaded_filenames = sorted(os.listdir(up_folder))
 
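
The six-slot default in index_data.get means a brand-new sid unpacks cleanly instead of raising KeyError, and every slot is falsy so the route can guard on it:

    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
        "no-such-sid", ([], None, None, [], [], [])
    )
    assert paragraphs == [] and index_faiss is None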
@@ -163,8 +216,18 @@ def index():
     faiss.normalize_L2(q_embed)
     D, I = index_faiss.search(q_embed, k=min(k, len(paragraphs)))
 
-    #
-
+    # Build result objects with file name + the file's 2nd line
+    for i in I[0]:
+        i = int(i)
+        file_idx = para_file_idx[i] if 0 <= i < len(para_file_idx) else -1
+        fname = file_meta[file_idx]["name"] if 0 <= file_idx < len(file_meta) else "unknown"
+        second = file_meta[file_idx]["second_line"] if 0 <= file_idx < len(file_meta) else ""
+        results.append({
+            "idx": i,
+            "text": paragraphs[i],
+            "file": fname,
+            "second_line": second
+        })
 
     _, res_folder, _, result_file = get_paths(sid)
     with open(result_file, "w", encoding='utf-8') as f:
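
For reference, one entry appended to results by the loop above looks like this (values illustrative):

    {
        "idx": 7,                         # row in the FAISS index / paragraphs list
        "text": "matched paragraph ...",  # the retrieved paragraph text
        "file": "notes.pdf",              # originating upload, via para_file_idx
        "second_line": "Dr. A. Author"    # that file's 2nd line, from file_meta
    }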
@@ -252,7 +315,7 @@ def api_context():
     except (TypeError, ValueError):
         return jsonify(error="Bad idx"), 400
 
-    paragraphs, _, _, spans = index_data.get(sid, (None, None, None, None))
+    paragraphs, _, _, spans, _, _ = index_data.get(sid, (None, None, None, None, None, None))
     if paragraphs is None or spans is None:
         return jsonify(error="No index for this sid. Upload files first."), 404
 
@@ -275,4 +338,4 @@ def api_context():
     )
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)
+    app.run(host="0.0.0.0", port=7860)