ntdservices committed on
Commit 9ef7cd9 · verified · 1 Parent(s): 366abe4

Update app.py

Files changed (1):
  app.py +88 -25
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 from flask import Flask, request, render_template, send_file, redirect, url_for, jsonify
 import os
 import re
@@ -7,7 +8,7 @@ import faiss
 from sentence_transformers import SentenceTransformer
 from PyPDF2 import PdfReader
 
-# Optional: if you had an NLI pipeline before, you can keep or remove it.
+# Optional NLI (not required for this feature)
 try:
     from transformers import pipeline as hf_pipeline
     nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
@@ -47,8 +48,11 @@ def clear_uploads_folder():
 clear_uploads_folder()
 print("✅ Uploads folder cleared.")
 
-# runtime cache keyed by search-id → (paragraphs_norm, embeddings, faiss-index, spans)
-# spans[i] = (start_char, end_char) in the full merged text for paragraph i
+# runtime cache keyed by search-id:
+# (paragraphs_norm, embeddings, faiss-index, spans, para_file_idx, file_meta)
+# spans[i] = (start_char, end_char) of paragraph i within merged.txt
+# para_file_idx[i] = index into file_meta for paragraph i
+# file_meta[j] = {"name": filename, "start": start_char, "end": end_char, "second_line": str}
 index_data = {}
 
 # ── helpers ────────────────────────────────────────────────────────────────────
@@ -62,22 +66,45 @@ def get_paths(sid: str):
     result_file = os.path.join(res_folder, "results.txt")
     return up_folder, res_folder, merged_file, result_file
 
-def extract_text(file_path: str) -> str:
-    if file_path.lower().endswith('.txt'):
-        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-            return f.read()
-    if file_path.lower().endswith('.pdf'):
+def compute_second_line(raw_text: str) -> str:
+    """Return the 2nd non-empty line if available, else the literal 2nd line, else ''."""
+    lines = raw_text.splitlines()
+    non_empty = [ln.strip() for ln in lines if ln.strip() != ""]
+    if len(non_empty) >= 2:
+        return non_empty[1]
+    if len(lines) >= 2:
+        return lines[1].strip()
+    return ""
+
+def extract_for_merge_and_second(file_path: str):
+    """
+    Return a tuple (merged_text_piece, second_line_str) for a single file.
+    - For .txt: merged part is raw file text.
+    - For .pdf: merged part is lightly cleaned text; second line is computed
+      using raw extracted line structure as well.
+    """
+    if file_path.lower().endswith(".txt"):
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            raw = f.read()
+        second = compute_second_line(raw)
+        return raw, second
+
+    if file_path.lower().endswith(".pdf"):
         reader = PdfReader(file_path)
-        chunks = []
+        pages = []
         for page in reader.pages:
             t = page.extract_text() or ""
-            chunks.append(t)
-        # Light cleanup: tighten excessive gaps, add blank lines between sentences
-        full_text = " ".join(chunks)
+            pages.append(t)
+        raw_lines_joined = "\n".join(pages)  # preserve some line structure for 2nd line
+        second = compute_second_line(raw_lines_joined)
+
+        # Light cleanup for the merged view
+        full_text = " ".join(pages)
         full_text = re.sub(r'(?<=[.!?])\s{2,}', '\n\n', full_text)
         full_text = re.sub(r'(\n\s*){2,}', '\n\n', full_text)
-        return full_text
-    return ""
+        return full_text, second
+
+    return "", ""
 
 def split_paragraphs_with_spans(merged_text: str):
     """
@@ -105,23 +132,47 @@ def split_paragraphs_with_spans(merged_text: str):
     return paras_norm, spans
 
 def rebuild_merged_and_index(sid: str):
-    """Build merged.txt, paragraph embeddings, and spans for *this* search id."""
+    """Build merged.txt, paragraph embeddings, spans, and per-paragraph file mapping."""
    up_folder, res_folder, merged_file, _ = get_paths(sid)
 
     merged_text = ""
+    file_meta = []  # list of dicts with name, start, end, second_line
+    # Append files in sorted order for stability
     for filename in sorted(os.listdir(up_folder)):
-        if filename.lower().endswith((".pdf", ".txt")):
-            merged_text += extract_text(os.path.join(up_folder, filename)).rstrip() + "\n\n"
+        if not filename.lower().endswith((".pdf", ".txt")):
+            continue
+        file_path = os.path.join(up_folder, filename)
+        part, second = extract_for_merge_and_second(file_path)
+        part = part.rstrip()
+        if not part:
+            continue
+        start = len(merged_text)
+        merged_text += part + "\n\n"  # add separator after each file
+        end = start + len(part)  # char range covering the file's text (exclude our extra \n\n)
+        file_meta.append({"name": filename, "start": start, "end": end, "second_line": second})
 
     with open(merged_file, "w", encoding='utf-8') as f:
         f.write(merged_text)
 
     paras_norm, spans = split_paragraphs_with_spans(merged_text)
-
     if not paras_norm:
-        index_data[sid] = ([], None, None, [])
+        index_data[sid] = ([], None, None, [], [], [])
         return
 
+    # Map each paragraph span to its originating file via start position
+    para_file_idx = []
+    for (pstart, _pend) in spans:
+        assigned = None
+        for j, meta in enumerate(file_meta):
+            next_start = file_meta[j+1]["start"] if j + 1 < len(file_meta) else float("inf")
+            if pstart >= meta["start"] and pstart < next_start:
+                assigned = j
+                break
+        if assigned is None:
+            assigned = max(0, len(file_meta)-1)
+        para_file_idx.append(assigned)
+
+    # Build embeddings + FAISS
     embed = model.encode(paras_norm, batch_size=32, show_progress_bar=False)
     embed = np.asarray(embed)
     if embed.ndim == 1:
@@ -130,7 +181,7 @@ def rebuild_merged_and_index(sid: str):
     idx = faiss.IndexFlatIP(embed.shape[1])
     idx.add(embed)
 
-    index_data[sid] = (paras_norm, embed, idx, spans)
+    index_data[sid] = (paras_norm, embed, idx, spans, para_file_idx, file_meta)
 
 # ── routes ─────────────────────────────────────────────────────────────────────
 @app.route("/", methods=["GET", "POST"])
@@ -140,7 +191,9 @@ def index():
     sid = str(uuid.uuid4())
 
     up_folder, _, _, _ = get_paths(sid)
-    paragraphs, embeddings, index_faiss, spans = index_data.get(sid, ([], None, None, []))
+    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
+        sid, ([], None, None, [], [], [])
+    )
 
     uploaded_filenames = sorted(os.listdir(up_folder))
 
@@ -163,8 +216,18 @@ def index():
     faiss.normalize_L2(q_embed)
     D, I = index_faiss.search(q_embed, k=min(k, len(paragraphs)))
 
-    # Keep the FAISS paragraph index so the UI can jump within full text.
-    results = [{"idx": int(i), "text": paragraphs[i]} for i in I[0]]
+    # Build result objects with file name + the file's 2nd line
+    for i in I[0]:
+        i = int(i)
+        file_idx = para_file_idx[i] if 0 <= i < len(para_file_idx) else -1
+        fname = file_meta[file_idx]["name"] if 0 <= file_idx < len(file_meta) else "unknown"
+        second = file_meta[file_idx]["second_line"] if 0 <= file_idx < len(file_meta) else ""
+        results.append({
+            "idx": i,
+            "text": paragraphs[i],
+            "file": fname,
+            "second_line": second
+        })
 
     _, res_folder, _, result_file = get_paths(sid)
     with open(result_file, "w", encoding='utf-8') as f:
@@ -252,7 +315,7 @@ def api_context():
     except (TypeError, ValueError):
         return jsonify(error="Bad idx"), 400
 
-    paragraphs, _, _, spans = index_data.get(sid, (None, None, None, None))
+    paragraphs, _, _, spans, _, _ = index_data.get(sid, (None, None, None, None, None, None))
     if paragraphs is None or spans is None:
         return jsonify(error="No index for this sid. Upload files first."), 404
 
@@ -275,4 +338,4 @@
     )
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)
+    app.run(host="0.0.0.0", port=7860)
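A minimal check of the span bookkeeping in rebuild_merged_and_index: "start" is recorded before appending and "end" excludes the "\n\n" separator, so slicing the merged text with the stored range recovers exactly the file's rstripped text (standalone sketch, sample text invented):

merged_text = ""
part = "Alpha paragraph.\n\nBeta paragraph."   # stand-in for one file's extracted text
start = len(merged_text)
merged_text += part + "\n\n"                   # same separator the loop appends
end = start + len(part)
assert merged_text[start:end] == part          # the recorded range excludes the separator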
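Because file_meta is appended in ascending "start" order, the paragraph-to-file lookup in the diff is a predecessor search; a bisect-based equivalent (an illustrative alternative, not what this commit ships) looks like:

from bisect import bisect_right

def map_spans_to_files(spans, file_meta):
    starts = [m["start"] for m in file_meta]   # sorted ascending by construction
    out = []
    for pstart, _pend in spans:
        j = bisect_right(starts, pstart) - 1   # last file whose start <= pstart
        out.append(max(0, j))                  # clamp to 0 if pstart precedes the first file
    return out

# Two files, the second starting at char 100: a paragraph at char 120 maps to file 1.
assert map_spans_to_files([(5, 20), (120, 150)], [{"start": 0}, {"start": 100}]) == [0, 1]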
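One note on the retrieval step: IndexFlatIP returns raw inner products, which equal cosine similarity only when both sides are L2-normalized. The diff normalizes the query via faiss.normalize_L2(q_embed); the paragraph embeddings must be normalized somewhere too (e.g., at encode time) for D to be cosine scores. A standalone sketch of the pattern (the 384-dim size and random vectors are stand-ins for model.encode output):

import numpy as np
import faiss

rng = np.random.default_rng(0)
embed = rng.standard_normal((8, 384)).astype("float32")  # pretend paragraph embeddings
faiss.normalize_L2(embed)                                # normalize the corpus side in place

idx = faiss.IndexFlatIP(embed.shape[1])
idx.add(embed)

q = rng.standard_normal((1, 384)).astype("float32")      # pretend query embedding
faiss.normalize_L2(q)
D, I = idx.search(q, 3)   # D: cosine scores, I: paragraph indices (cf. I[0] in the route)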