Commit 2341eb2
Parent(s): 9d99321

Removed the logging code

app.py CHANGED
@@ -26,59 +26,6 @@ vector_db = VectorStore()
 topic_modeller = TopicModeller()
 
 
-def print_recent_logs(n: int = 5):
-    """
-    Print the last N log lines to the container logs for developer monitoring.
-    """
-    log_file = "semanticdala_log.csv"
-
-    if os.path.exists(log_file):
-        print(f"\n[SEMANTICDALA USAGE LOG - Last {n} Entries]")
-
-        with open(log_file, "r") as f:
-            lines = f.readlines()
-
-        for line in lines[-n:]:
-            print(line.strip())
-
-        print("[END LOG SNAPSHOT]\n")
-
-
-def log_submission(filename: str, num_chunks: int, start_time: float, status: str, session_id: str = "anonymous") -> None:
-    """
-    Basic logging utility to keep track of app usage.
-    """
-    log_file = "semanticdala_log.csv"
-    end_time = time.time()
-    duration = round(end_time - start_time, 2)
-
-    # Anonymise filename for privacy
-    anonymized_name = hashlib.sha256(filename.encode()).hexdigest()[:10]
-
-    # Get file size in bytes
-    file_size = os.path.getsize(filename) if os.path.exists(filename) else 0
-    file_size_mb = round(file_size / (1024 * 1024), 2)
-
-    log_entry = {
-        "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
-        "filename_hash": anonymized_name,
-        "file_size_mb": file_size_mb,
-        "num_chunks": num_chunks,
-        "processing_time_sec": duration,
-        "status": status,
-        "session_id": session_id
-    }
-
-    file_exists = os.path.isfile(log_file)
-
-    with open(log_file, mode="a", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=log_entry.keys())
-
-        if not file_exists:
-            writer.writeheader()
-
-        writer.writerow(log_entry)
-
 
 def extract_text(file: Any) -> str:
     """
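
print_recent_logs() was called from process_file — the next hunk removes two of its call sites — and log_submission presumably sat alongside it, though its call sites are not recoverable from this view. A rough smoke test of the deleted helpers, with hypothetical argument values; it assumes the two definitions above and app.py's os, csv, hashlib, time and datetime imports are in scope:

    import time

    start = time.time()
    # ... handle one upload here, counting its chunks ...
    log_submission("example.txt", num_chunks=12, start_time=start, status="success", session_id="demo")
    print_recent_logs(n=1)  # echoes the row just appended to semanticdala_log.csv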
@@ -105,45 +52,32 @@ def process_file(file: Any) -> Tuple[List[Tuple[str, int]], Any, Any]:
     Main file processing function, which will also chunk, transliterate and cluster
     the file contents, as well as plot the clusters.
     """
-
+    raw_text = extract_text(file)
+    chunks = chunk_text(raw_text)
 
-
-
-
+    # Transliterate, deduplicate and embed the chunks
+    translits = translit.batch_transliterate(chunks)
+    dedup_translits = deduplicate_chunks(translits, embedder)
+    embeddings = embedder.embed_batch(dedup_translits)
 
-
-
-
-    embeddings = embedder.embed_batch(dedup_translits)
+    # Clear previous entries before adding
+    vector_db.index.reset()
+    vector_db.metadata = []
 
-
-    vector_db.index.reset()
-    vector_db.metadata = []
+    metadata = [{"id": f"{file.name}_chunk{i}", "text": t} for i, t in enumerate(dedup_translits)]
 
-
-
-    vector_db.add(embeddings, metadata)
-
-    # Topic modelling
-    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
-
-    # Get a list of rows for topic labels
-    overview_table = [[k, v] for k, v in topic_labels.items()]
-
-    # Zip back transliterated text with topic IDs
-    annotated = list(zip(dedup_translits, topics))
+    vector_db.add(embeddings, metadata)
 
-
-
-    print_recent_logs()
+    # Topic modelling
+    topics, fig, topic_labels, umap_fig = topic_modeller.fit(dedup_translits, embeddings)
 
-
+    # Get a list of rows for topic labels
+    overview_table = [[k, v] for k, v in topic_labels.items()]
 
-
-
-    print_recent_logs()
+    # Zip back transliterated text with topic IDs
+    annotated = list(zip(dedup_translits, topics))
 
-
+    return annotated, fig, overview_table, umap_fig
 
 
 def search_text(query: str):
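
A note on the vector store usage above: process_file calls vector_db.index.reset() and clears vector_db.metadata before add(), so each upload replaces the previous index rather than appending to it. The VectorStore class itself is outside this diff; below is a minimal sketch consistent with those calls, assuming a FAISS flat index and 384-dimensional float32 embeddings (the real class may well differ):

    # Hypothetical VectorStore matching the reset()/metadata/add() usage in process_file.
    import faiss
    import numpy as np

    class VectorStore:
        def __init__(self, dim: int = 384):
            self.index = faiss.IndexFlatL2(dim)  # .reset() drops all stored vectors
            self.metadata: list[dict] = []       # parallel list, cleared alongside the index

        def add(self, embeddings, metadata: list[dict]) -> None:
            self.index.add(np.asarray(embeddings, dtype="float32"))
            self.metadata.extend(metadata)

        def search(self, query_vec, k: int = 5):
            dists, ids = self.index.search(np.asarray([query_vec], dtype="float32"), k)
            return [(self.metadata[i], float(d)) for i, d in zip(ids[0], dists[0]) if i != -1]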
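
deduplicate_chunks(translits, embedder) is likewise not shown in this commit. Given that it takes the transliterated chunks together with the embedder, a plausible reading is near-duplicate removal by embedding similarity; a hypothetical sketch, with the 0.95 cosine threshold invented:

    import numpy as np

    def deduplicate_chunks(chunks: list[str], embedder, threshold: float = 0.95) -> list[str]:
        # Embed once, unit-normalise, then keep each chunk only if no earlier
        # kept chunk exceeds the cosine-similarity threshold.
        vecs = np.asarray(embedder.embed_batch(chunks), dtype="float32")
        vecs /= np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12
        kept, kept_vecs = [], []
        for text, vec in zip(chunks, vecs):
            if kept_vecs and float(np.max(np.stack(kept_vecs) @ vec)) >= threshold:
                continue  # near-duplicate of an already-kept chunk
            kept.append(text)
            kept_vecs.append(vec)
        return kept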