ntdservices committed
Commit 2b041e7 · verified · 1 Parent(s): 0f84df9

Update app.py

Files changed (1)
  1. app.py +82 -52
app.py CHANGED
@@ -5,23 +5,27 @@ import uuid
  import numpy as np
  import faiss
  from sentence_transformers import SentenceTransformer
- from transformers import pipeline
  from PyPDF2 import PdfReader

- print("✅ App starting...")
+ # Optional: if you had an NLI pipeline before, you can keep or remove it.
+ try:
+     from transformers import pipeline as hf_pipeline
+     nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
+     print("✅ NLI pipeline loaded.")
+ except Exception as e:
+     nli = None
+     print("ℹ️ NLI pipeline not loaded (optional):", e)
+
  print("⏳ Loading SentenceTransformer model...")
  model = SentenceTransformer('all-MiniLM-L6-v2')
- print("✅ Model loaded.")
-
- print("⏳ Loading NLI pipeline...")
- nli = pipeline("text-classification", model="microsoft/deberta-large-mnli")
- print("✅ NLI pipeline loaded.")
+ print("✅ Encoder loaded.")

  app = Flask(__name__)

  # ── base folders ───────────────────────────────────────────────────────────────
- BASE_UPLOADS = os.path.join(os.path.dirname(__file__), "uploads")
- BASE_RESULTS = os.path.join(os.path.dirname(__file__), "results")
+ BASE_DIR = os.path.dirname(__file__)
+ BASE_UPLOADS = os.path.join(BASE_DIR, "uploads")
+ BASE_RESULTS = os.path.join(BASE_DIR, "results")
  os.makedirs(BASE_UPLOADS, exist_ok=True)
  os.makedirs(BASE_RESULTS, exist_ok=True)
 
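Because the NLI pipeline is now optional, any downstream use has to be guarded on nli being non-None. If it did load, a premise/hypothesis pair can be scored with the standard text-classification call; the sentences below are illustrative only and not part of this commit:

if nli is not None:
    # deberta-large-mnli is a sentence-pair model: pass the premise as "text"
    # and the hypothesis as "text_pair"; labels are CONTRADICTION/NEUTRAL/ENTAILMENT.
    scores = nli({"text": "The report was filed on Monday.",
                  "text_pair": "The report was filed at the start of the week."})
    print(scores)
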
@@ -39,10 +43,12 @@ def clear_uploads_folder():
              os.rmdir(path)
          else:
              os.remove(path)
+
  clear_uploads_folder()
  print("✅ Uploads folder cleared.")

- # runtime cache keyed by search-id → (paragraphs, embeddings, faiss-index)
+ # runtime cache keyed by search-id → (paragraphs_norm, embeddings, faiss-index, spans)
+ # spans[i] = (start_char, end_char) in the full merged text for paragraph i
  index_data = {}

  # ── helpers ────────────────────────────────────────────────────────────────────
@@ -56,38 +62,67 @@ def get_paths(sid: str):
      result_file = os.path.join(res_folder, "results.txt")
      return up_folder, res_folder, merged_file, result_file

- def extract_text(file_path):
-     if file_path.endswith('.txt'):
-         with open(file_path, 'r', encoding='utf-8') as f:
+ def extract_text(file_path: str) -> str:
+     if file_path.lower().endswith('.txt'):
+         with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
              return f.read()
-     elif file_path.endswith('.pdf'):
+     if file_path.lower().endswith('.pdf'):
          reader = PdfReader(file_path)
-         full_text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())
+         chunks = []
+         for page in reader.pages:
+             t = page.extract_text() or ""
+             chunks.append(t)
+         # Light cleanup: tighten excessive gaps, add blank lines between sentences
+         full_text = " ".join(chunks)
          full_text = re.sub(r'(?<=[.!?])\s{2,}', '\n\n', full_text)
-         full_text = re.sub(r'(?<=[a-z])\.\s+(?=[A-Z])', '.\n\n', full_text)
          full_text = re.sub(r'(\n\s*){2,}', '\n\n', full_text)
          return full_text
      return ""

+ def split_paragraphs_with_spans(merged_text: str):
+     """
+     Split merged_text into logical 'paragraphs' based on blank lines,
+     returning normalized paragraphs for embedding AND exact spans (start,end)
+     in the original merged_text for highlighting/jumping.
+     """
+     sep = re.compile(r'(?:\r?\n\s*\r?\n)+', flags=re.MULTILINE)
+     paras_norm = []
+     spans = []
+     pos = 0
+     for m in sep.finditer(merged_text):
+         seg = merged_text[pos:m.start()]
+         norm = re.sub(r'\s+', ' ', seg).strip()
+         if len(norm.split()) > 4:  # keep only substantive chunks
+             paras_norm.append(norm)
+             spans.append((pos, m.start()))
+         pos = m.end()
+     # Tail
+     seg = merged_text[pos:]
+     norm = re.sub(r'\s+', ' ', seg).strip()
+     if len(norm.split()) > 4:
+         paras_norm.append(norm)
+         spans.append((pos, len(merged_text)))
+     return paras_norm, spans
+
  def rebuild_merged_and_index(sid: str):
-     """Re-embed everything for *this* search id."""
-     up_folder, _, merged_file, _ = get_paths(sid)
+     """Build merged.txt, paragraph embeddings, and spans for *this* search id."""
+     up_folder, res_folder, merged_file, _ = get_paths(sid)

      merged_text = ""
-     for filename in os.listdir(up_folder):
+     for filename in sorted(os.listdir(up_folder)):
          if filename.lower().endswith((".pdf", ".txt")):
-             merged_text += extract_text(os.path.join(up_folder, filename)) + "\n\n"
+             merged_text += extract_text(os.path.join(up_folder, filename)).rstrip() + "\n\n"

      with open(merged_file, "w", encoding='utf-8') as f:
          f.write(merged_text)

-     paras = re.split(r'\n\s*\n+', merged_text)
-     paras = [p.strip().replace('\n', ' ') for p in paras if len(p.strip().split()) > 4]
-     if not paras:
-         index_data[sid] = ([], None, None)
+     paras_norm, spans = split_paragraphs_with_spans(merged_text)
+
+     if not paras_norm:
+         index_data[sid] = ([], None, None, [])
          return

-     embed = model.encode(paras, batch_size=32, show_progress_bar=False)
+     embed = model.encode(paras_norm, batch_size=32, show_progress_bar=False)
      embed = np.asarray(embed)
      if embed.ndim == 1:
          embed = embed[np.newaxis, :]
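
The new split_paragraphs_with_spans helper keeps, for every paragraph it embeds, the exact (start, end) character offsets into the merged text, so the UI can later highlight the hit inside the full document. A small standalone sanity check of the invariant it relies on (the function is re-implemented compactly here only for illustration; the real one lives in app.py):

import re

def split_paragraphs_with_spans(merged_text):
    sep = re.compile(r'(?:\r?\n\s*\r?\n)+')
    paras_norm, spans, pos = [], [], 0
    for m in sep.finditer(merged_text):
        norm = re.sub(r'\s+', ' ', merged_text[pos:m.start()]).strip()
        if len(norm.split()) > 4:
            paras_norm.append(norm)
            spans.append((pos, m.start()))
        pos = m.end()
    norm = re.sub(r'\s+', ' ', merged_text[pos:]).strip()
    if len(norm.split()) > 4:
        paras_norm.append(norm)
        spans.append((pos, len(merged_text)))
    return paras_norm, spans

sample = ("First paragraph with more than four words here.\n\n"
          "too short\n\n"
          "Second paragraph that also has enough words to keep.")
paras, spans = split_paragraphs_with_spans(sample)
assert len(paras) == 2                      # "too short" is filtered out
for p, (s, e) in zip(paras, spans):
    # normalizing the raw slice reproduces the embedded paragraph exactly
    assert re.sub(r'\s+', ' ', sample[s:e]).strip() == p
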
@@ -95,7 +130,7 @@ def rebuild_merged_and_index(sid: str):
      idx = faiss.IndexFlatIP(embed.shape[1])
      idx.add(embed)

-     index_data[sid] = (paras, embed, idx)
+     index_data[sid] = (paras_norm, embed, idx, spans)

  # ── routes ─────────────────────────────────────────────────────────────────────
  @app.route("/", methods=["GET", "POST"])
@@ -105,7 +140,7 @@ def index():
      sid = str(uuid.uuid4())

      up_folder, _, _, _ = get_paths(sid)
-     paragraphs, embeddings, index_faiss = index_data.get(sid, ([], None, None))
+     paragraphs, embeddings, index_faiss, spans = index_data.get(sid, ([], None, None, []))

      uploaded_filenames = sorted(os.listdir(up_folder))

@@ -128,7 +163,7 @@ def index():
      faiss.normalize_L2(q_embed)
      D, I = index_faiss.search(q_embed, k=min(k, len(paragraphs)))

-     # Keep both the text and the FAISS paragraph index so the UI can fetch context.
+     # Keep the FAISS paragraph index so the UI can jump within full text.
      results = [{"idx": int(i), "text": paragraphs[i]} for i in I[0]]

      _, res_folder, _, result_file = get_paths(sid)
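
The query embedding is L2-normalized before searching the IndexFlatIP index, and the paragraph embeddings are presumably normalized the same way when the index is built (that line falls just outside these hunks); with unit vectors the inner product is exactly the cosine score. A self-contained sketch of that pattern with random vectors (384 is the all-MiniLM-L6-v2 embedding size):

import numpy as np
import faiss

rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 384)).astype("float32")   # stand-in for paragraph embeddings
query = rng.normal(size=(1, 384)).astype("float32")    # stand-in for the encoded query

faiss.normalize_L2(docs)     # in-place unit normalization
faiss.normalize_L2(query)

index = faiss.IndexFlatIP(docs.shape[1])
index.add(docs)
D, I = index.search(query, 5)   # D: cosine scores, I: row ids of the nearest paragraphs
print(I[0], D[0])
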
@@ -204,45 +239,40 @@ def ping():
  @app.route("/api/context")
  def api_context():
      """
-     Return an excerpt of the merged paragraphs centered on the requested paragraph index.
-     Query params: sid, idx (int), window (int, optional, default 3) – number of surrounding paras on each side.
+     Return FULL merged text plus the exact character span for the requested paragraph.
+     Query params: sid, idx (int)
+     Response: { merged: str, start: int, end: int, total_len: int }
      """
      sid = request.args.get("sid")
+     if not sid:
+         return jsonify(error="Missing sid"), 400
+
      try:
          idx = int(request.args.get("idx", "-1"))
      except (TypeError, ValueError):
          return jsonify(error="Bad idx"), 400

-     try:
-         window = int(request.args.get("window", "3"))
-     except (TypeError, ValueError):
-         window = 3
-
-     if not sid or idx < 0:
-         return jsonify(error="Missing sid or idx"), 400
-
-     paragraphs, _, _ = index_data.get(sid, (None, None, None))
-     if paragraphs is None:
+     paragraphs, _, _, spans = index_data.get(sid, (None, None, None, None))
+     if paragraphs is None or spans is None:
          return jsonify(error="No index for this sid. Upload files first."), 404
-     if idx >= len(paragraphs):
+
+     if not (0 <= idx < len(spans)):
          return jsonify(error="idx out of range"), 400

-     start = max(0, idx - window)
-     end = min(len(paragraphs), idx + window + 1)
-     context_paras = paragraphs[start:end]
-     center_local = idx - start  # where the highlighted paragraph sits in that slice
+     _, _, merged_file, _ = get_paths(sid)
+     if not os.path.exists(merged_file):
+         return jsonify(error="merged.txt not found"), 404

+     with open(merged_file, "r", encoding="utf-8") as f:
+         merged_text = f.read()
+
+     start, end = spans[idx]
      return jsonify(
-         paras=context_paras,
-         center=center_local,
+         merged=merged_text,
          start=start,
          end=end,
-         total=len(paragraphs)
+         total_len=len(merged_text)
      )

- #if __name__ == "__main__":
- #    from waitress import serve
- #    serve(app, host="0.0.0.0", port=9001, threads=4)
-
  if __name__ == "__main__":
      app.run(host="0.0.0.0", port=7860)
 
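With this change, the context endpoint hands back the whole merged document plus one character span, so highlighting is just a string slice on the client. A minimal sketch of a caller, assuming the Space is reachable locally on port 7860 and that sid is a search id from a previous upload (both values are illustrative):

import requests

BASE = "http://localhost:7860"                      # illustrative URL
sid = "00000000-0000-0000-0000-000000000000"        # placeholder search id

resp = requests.get(f"{BASE}/api/context", params={"sid": sid, "idx": 0})
resp.raise_for_status()
data = resp.json()

merged, start, end = data["merged"], data["start"], data["end"]
print("Matched paragraph:")
print(merged[start:end])                            # the raw span, newlines included
print(f"...located at chars {start}-{end} of {data['total_len']}")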