Muhammed Essam committed on
Commit
8ef276c
·
1 Parent(s): eee15d4

Initial commit: Voice Assistant demo

Browse files
.gitattributes CHANGED
@@ -1,35 +1,29 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
 
 
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ # Git LFS attributes for Hugging Face Spaces
2
+ # Track large files with Git LFS
3
+
4
+ # Model files
5
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  *.pt filter=lfs diff=lfs merge=lfs -text
7
  *.pth filter=lfs diff=lfs merge=lfs -text
 
8
  *.safetensors filter=lfs diff=lfs merge=lfs -text
9
+ *.h5 filter=lfs diff=lfs merge=lfs -text
10
+ *.pb filter=lfs diff=lfs merge=lfs -text
11
+ *.onnx filter=lfs diff=lfs merge=lfs -text
12
+
13
+ # Pickle files (model weights)
14
+ *.pkl filter=lfs diff=lfs merge=lfs -text
15
+ *.pickle filter=lfs diff=lfs merge=lfs -text
16
+
17
+ # Audio files (if you want to include examples)
18
+ *.wav filter=lfs diff=lfs merge=lfs -text
19
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
20
+ *.ogg filter=lfs diff=lfs merge=lfs -text
21
+ *.flac filter=lfs diff=lfs merge=lfs -text
22
+
23
+ # Archive files
24
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+
27
+ # Data files
28
+ *.arrow filter=lfs diff=lfs merge=lfs -text
29
+ *.parquet filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,163 @@
1
  ---
2
- title: Contact Search Assistant
3
- emoji: 📈
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.0.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Voice Assistant - Multi-language Division Matching & Contact Search
3
+ emoji: 🎙️
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # 🎙️ Voice Assistant Demo
14
+
15
+ A powerful multi-language voice assistant that helps users find divisions and contacts within an organization using natural language queries.
16
+
17
+ ## 🌟 Features
18
+
19
+ ### 🗣️ Multi-language Voice Input
20
+ - **99+ languages** supported (auto-detected)
21
+ - Automatic speech-to-text using OpenAI Whisper
22
+ - Arabic-to-English translation for seamless processing
23
+ - Works with various audio formats
24
+
25
+ ### 🎯 Smart Division Matching
26
+ - Semantic search using sentence embeddings
27
+ - Confidence-based routing with intelligent thresholds
28
+ - Department-level expansion (searches all divisions in a department)
29
+ - Fast matching (~50ms) using `all-MiniLM-L6-v2`
30
+
31
+ ### 👤 Name Extraction
32
+ - Extracts person names from queries using GLiNER
33
+ - Supports English and Arabic names
34
+ - Zero-shot NER for robust extraction
35
+
36
+ ### 📞 Contact Search
37
+ - 500+ contacts across 23 departments and 67 divisions
38
+ - Intelligent matching combining name and division
39
+ - Confidence scoring with match reasoning
40
+ - Fuzzy name matching for typos and variations
41
+
42
+ ## 🚀 How to Use
43
+
44
+ ### Division Matching (Text)
45
+ Find the right division for your query:
46
+ ```
47
+ "I need help from IT Security"
48
+ "Find someone in Finance"
49
+ "Connect me to Human Resources"
50
+ ```
51
+
52
+ ### Division Matching (Voice)
53
+ Speak your query in any language - it will be transcribed and processed automatically.
54
+
55
+ ### Contact Search (Text)
56
+ Search for specific people or teams:
57
+ ```
58
+ "Find Dima in Information Technology"
59
+ "Ahmed Al-Malek"
60
+ "I need to talk to someone in Legal"
61
+ ```
62
+
63
+ ### Contact Search (Voice)
64
+ Speak your contact search query in any language.
65
+
66
+ ## 📊 Example Queries
67
+
68
+ ### Department-Level Queries
69
+ These queries search across ALL divisions in a department:
70
+ - ✅ "Find someone in Information Technology" → Searches 8 IT divisions
71
+ - ✅ "I need help from Finance" → Searches all Finance divisions
72
+ - ✅ "Connect me to Human Resources" → Searches all HR divisions
73
+
74
+ ### Division-Level Queries
75
+ These match specific divisions:
76
+ - ✅ "Find Ahmed in App Dev" → Applications Development & Integrations
77
+ - ✅ "I need help from IT Security" → IT Security Implementation & Operations
78
+ - ✅ "Connect me to Legal" → Legal divisions
79
+
80
+ ### Name-Only Queries
81
+ - ✅ "Find Dima" → Searches all contacts named Dima
82
+ - ✅ "Ahmed Al-Malek" → Exact name match
83
+ - ✅ "I need to talk to Rashed" → Fuzzy name matching
84
+
85
+ ### Combined Queries (Name + Department/Division)
86
+ Priority given to division accuracy:
87
+ - ✅ "Find Dima in Information Technology" → Perfect match (confidence: 1.00)
88
+ - ✅ "Find Ahmed in App Dev" → Shows App Dev team members
89
+
90
+ ## 🔧 Technical Details
91
+
92
+ ### Models Used
93
+ - **Embeddings**: `sentence-transformers/all-MiniLM-L6-v2` - Fast, lightweight semantic search
94
+ - **Name Extraction**: `urchade/gliner_small-v2.1` - Zero-shot NER for person names
95
+ - **Speech-to-Text**: `openai/whisper-tiny` - Optimized for speed on CPU
96
+
97
+ ### Confidence Scoring
98
+
99
+ | Score | Meaning | Example |
100
+ |-------|---------|---------|
101
+ | **1.00** | Perfect match (name + division) | Dima in IT |
102
+ | **0.95** | Exact name match | Ahmed Al-Malek |
103
+ | **0.66** | Strong division match | People in requested division |
104
+ | **0.59** | Good division match | Close division match |
105
+ | **< 0.30** | Low confidence | Wrong division penalty |
106
+
107
+ ### Match Reasons
108
+ - `name_and_division_match` - Both name AND division match ✅
109
+ - `division_match` - Division/department matches (no name match)
110
+ - `exact_name_match` - Exact name match (100%)
111
+ - `fuzzy_name_match` - Partial name match (75%+)
112
+ - `name_match_wrong_division` - Name matches but WRONG division ⚠️
113
+
114
+ ## 📦 Database Stats
115
+ - **500 contacts** across the organization
116
+ - **23 departments** (Information Technology, Finance, HR, etc.)
117
+ - **67 divisions** (specific teams and units)
118
+ - **Multi-language support** (English + Arabic names)
119
+
120
+ ## 🌍 Supported Languages
121
+
122
+ The voice assistant supports **99+ languages** including:
123
+ - English
124
+ - Arabic (العربية)
125
+ - Spanish, French, German, Italian
126
+ - Chinese (中文), Japanese (日本語), Korean (한국어)
127
+ - Hindi, Urdu, Bengali
128
+ - And many more...
129
+
130
+ Language is automatically detected - just speak naturally!
131
+
132
+ ## ⚡ Performance
133
+
134
+ - **Division Matching**: ~50ms per query
135
+ - **Name Extraction**: ~100-200ms per query
136
+ - **Voice Processing**: ~1-3 seconds (depends on audio length)
137
+ - **Contact Search**: ~100-300ms per query
138
+
139
+ ## 🛠️ Built With
140
+
141
+ - **Gradio** - Interactive web interface
142
+ - **FastAPI** - Backend API (original implementation)
143
+ - **Sentence Transformers** - Semantic search
144
+ - **OpenAI Whisper** - Speech recognition
145
+ - **GLiNER** - Named Entity Recognition
146
+ - **PyTorch** - Deep learning framework
147
+
148
+ ## 📝 License
149
+
150
+ MIT License
151
+
152
+ ## 🙏 Acknowledgments
153
+
154
+ - OpenAI for Whisper
155
+ - Hugging Face for model hosting
156
+ - URCHADE for GLiNER
157
+ - Sentence Transformers team
158
+
159
+ ---
160
+
161
+ - **Version:** 4.0.0
162
+ - **Status:** ✅ Production Ready
163
+ - **Demo Type:** Interactive Gradio Demo
app.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Spaces Demo - Voice Assistant API
3
+ Multi-language voice assistant with division matching and contact search
4
+ """
5
+
6
+ import gradio as gr
7
+ import logging
8
+ from typing import Optional, Tuple
9
+ import numpy as np
10
+
11
+ # Import existing services
12
+ from embedding_service import EmbeddingService
13
+ from name_extraction_service import NameExtractor
14
+ from voice_processing_service import VoiceProcessor
15
+ from contact_search_service import ContactSearchService
16
+
17
+ # Set up logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Global services (initialized once)
22
+ embedding_service: Optional[EmbeddingService] = None
23
+ name_extractor: Optional[NameExtractor] = None
24
+ voice_processor: Optional[VoiceProcessor] = None
25
+ contact_search_service: Optional[ContactSearchService] = None
26
+
27
+
28
def initialize_services():
    """Create every service and publish it through the module-level globals.

    The embedding service and the name extractor are created before the
    contact search service because the latter is constructed from both of
    them. Progress is reported through the module logger.

    Returns:
        The statistics mapping produced by the contact search service's
        ``get_contact_stats()``; this function reads its
        ``total_contacts`` and ``divisions`` entries for the log line.
    """
    global embedding_service, name_extractor, voice_processor, contact_search_service

    logger.info("🚀 Initializing services...")

    # Sentence-embedding model used for division matching.
    logger.info("Loading embedding model...")
    embedding_service = EmbeddingService(model_name="all-MiniLM-L6-v2")
    logger.info("✓ Embedding service ready!")

    # Named-entity model used to pull person names out of queries.
    logger.info("Loading name extraction model...")
    name_extractor = NameExtractor(model_name="urchade/gliner_small-v2.1")
    logger.info("✓ Name extractor ready!")

    # Speech-to-text; the "tiny" model size is chosen for faster inference.
    logger.info("Loading Whisper model...")
    voice_processor = VoiceProcessor(model_size="tiny")
    logger.info("✓ Voice processor ready!")

    # Contact search is wired to the two services created above.
    logger.info("Loading contact database...")
    contact_search_service = ContactSearchService(name_extractor, embedding_service)
    contact_stats = contact_search_service.get_contact_stats()
    logger.info(
        f"✓ Loaded {contact_stats['total_contacts']} contacts "
        f"across {contact_stats['divisions']} divisions"
    )

    return contact_stats
56
+
57
+
58
def format_division_matches(matches, names):
    """Render division matches as a Markdown string.

    Args:
        matches: Sequence of match objects exposing ``confidence``,
            ``division``, ``department`` and ``keywords`` attributes. Only
            the first three entries are rendered.
        names: Person names extracted from the query. When non-empty they
            are listed in a header line above the matches.

    Returns:
        The Markdown text, or ``"No matches found."`` when ``matches`` is
        empty.
    """
    if not matches:
        return "No matches found."

    output = []

    if names:
        output.append(f"**Extracted Names:** {', '.join(names)}\n")

    output.append("### 🎯 Division Matches:\n")

    for i, match in enumerate(matches[:3], 1):
        confidence_pct = match.confidence * 100
        # Clamp to the 0..5 range so the bar is always exactly five symbols,
        # even if a score falls outside [0, 1] (e.g. a negative similarity).
        filled = max(0, min(5, int(confidence_pct / 20)))
        confidence_bar = "🟢" * filled + "⚪" * (5 - filled)

        output.append(f"**{i}. {match.division}**")
        output.append(f" - Confidence: {confidence_pct:.1f}% {confidence_bar}")
        output.append(f" - Department: {match.department}")
        output.append(f" - Keywords: {', '.join(match.keywords[:3])}")
        output.append("")

    return "\n".join(output)
81
+
82
+
83
def format_contact_results(contacts, extracted_names, matched_divisions):
    """Render contact search results as a Markdown string.

    Args:
        contacts: Sequence of contact dicts. Only the first ten entries are
            rendered, although the heading reports the full count.
        extracted_names: Person names extracted from the query; listed in a
            header line when non-empty.
        matched_divisions: Division names matched for the query; the first
            three are listed in a header line when non-empty.

    Returns:
        The Markdown text, or ``"No contacts found."`` when ``contacts`` is
        empty.
    """
    if not contacts:
        return "No contacts found."

    def field(contact, *keys):
        # Return the first present, non-empty value among ``keys`` so a
        # single incomplete record cannot abort rendering of the whole list.
        for key in keys:
            value = contact.get(key)
            if value is not None and value != "":
                return value
        return "N/A"

    output = []

    if extracted_names:
        output.append(f"**Extracted Names:** {', '.join(extracted_names)}\n")

    if matched_divisions:
        output.append(f"**Matched Divisions:** {', '.join(matched_divisions[:3])}\n")

    output.append(f"### 👥 Found {len(contacts)} Contact(s):\n")

    for i, contact in enumerate(contacts[:10], 1):
        confidence_pct = contact.get('confidence', 0.0) * 100
        # Clamp to the 0..5 range so the bar is always exactly five symbols.
        filled = max(0, min(5, int(confidence_pct / 20)))
        confidence_bar = "🟢" * filled + "⚪" * (5 - filled)

        # The contact search service refers to the display name as
        # ``full_name_en``; fall back to it when ``name`` is absent.
        output.append(f"**{i}. {field(contact, 'name', 'full_name_en')}**")
        output.append(f" - Position: {field(contact, 'position')}")
        output.append(f" - Division: {field(contact, 'division')}")
        output.append(f" - Department: {field(contact, 'department')}")
        output.append(f" - Phone: {field(contact, 'phone')}")
        output.append(f" - Email: {field(contact, 'email')}")
        output.append(f" - Confidence: {confidence_pct:.1f}% {confidence_bar}")
        output.append(f" - Match Reason: {field(contact, 'match_reason')}")
        output.append("")

    return "\n".join(output)
113
+
114
+
115
def search_divisions_text(query: str) -> str:
    """Return Markdown describing the divisions that match a typed query.

    Blank input yields a prompt to enter a query. Any exception raised while
    extracting names, matching divisions or formatting is logged and turned
    into an ``"Error: ..."`` string instead of propagating.
    """
    if not (query and query.strip()):
        return "Please enter a query."

    try:
        # Names are extracted first, then the three best divisions are looked up.
        extracted = name_extractor.extract_names(query)
        top_divisions = embedding_service.find_division(query, top_k=3)
        rendered = format_division_matches(top_divisions, extracted)
        return rendered
    except Exception as err:
        logger.error(f"Error in division search: {err}")
        return f"Error: {err!s}"
132
+
133
+
134
def search_divisions_voice(audio: Optional[Tuple[int, np.ndarray]]) -> str:
    """Transcribe a recorded query and return its division matches as Markdown.

    Args:
        audio: ``(sample_rate, samples)`` pair from the audio input (which is
            configured with ``type="numpy"``), or ``None`` when nothing was
            recorded.

    Returns:
        Markdown containing the transcribed text, the detected language, the
        original text when the query was translated, and the formatted
        division matches. On failure an ``"Error: ..."`` string is returned
        instead of raising.
    """
    if audio is None:
        return "Please record audio first."

    temp_path = None
    try:
        # Save audio to a temporary file for the voice processor.
        sample_rate, audio_data = audio
        temp_path = voice_processor.save_audio_array(audio_data, sample_rate)

        # Process voice query
        voice_result = voice_processor.process_voice_query(temp_path)
        query = voice_result['query']

        # Extract names
        names = name_extractor.extract_names(query)

        # Find matching divisions
        matches = embedding_service.find_division(query, top_k=3)

        # Format output
        output = [
            f"**🎤 Transcribed Text:** {query}",
            f"**🌍 Language:** {voice_result['language_name']}",
        ]
        if voice_result['was_translated']:
            output.append(f"**📝 Original:** {voice_result['original_text']}")
        output.append("")
        output.append(format_division_matches(matches, names))

        return "\n".join(output)

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error in voice division search: {e}")
        return f"Error: {str(e)}"

    finally:
        # Remove the temporary recording on every path, including when
        # transcription or matching failed part-way through; previously the
        # file was only removed on success.
        if temp_path is not None:
            try:
                voice_processor.cleanup_temp_file(temp_path)
            except Exception as cleanup_error:
                # A failed cleanup must not replace a result already computed.
                logger.warning(f"Could not remove temporary audio file: {cleanup_error}")
171
+
172
+
173
def search_contacts_text(query: str) -> str:
    """Return Markdown describing the contacts that match a typed query.

    Blank input yields a prompt to enter a query. Any exception raised during
    the search or formatting is logged and turned into an ``"Error: ..."``
    string instead of propagating.
    """
    if not (query and query.strip()):
        return "Please enter a query."

    try:
        # Up to ten contacts at or above the 0.3 confidence floor.
        found = contact_search_service.search_contacts(query, top_k=10, min_confidence=0.3)

        # Names and divisions are looked up again only for the header lines.
        extracted = name_extractor.extract_names(query)
        division_names = [
            hit.division for hit in embedding_service.find_division(query, top_k=3)
        ]

        return format_contact_results(found, extracted, division_names)
    except Exception as err:
        logger.error(f"Error in contact search: {err}")
        return f"Error: {err!s}"
192
+
193
+
194
def search_contacts_voice(audio: Optional[Tuple[int, np.ndarray]]) -> str:
    """Transcribe a recorded query and return matching contacts as Markdown.

    Args:
        audio: ``(sample_rate, samples)`` pair from the audio input (which is
            configured with ``type="numpy"``), or ``None`` when nothing was
            recorded.

    Returns:
        Markdown containing the transcribed text, the detected language, the
        original text when the query was translated, and the formatted
        contact results. On failure an ``"Error: ..."`` string is returned
        instead of raising.
    """
    if audio is None:
        return "Please record audio first."

    temp_path = None
    try:
        # Save audio to a temporary file for the voice processor.
        sample_rate, audio_data = audio
        temp_path = voice_processor.save_audio_array(audio_data, sample_rate)

        # Process voice query
        voice_result = voice_processor.process_voice_query(temp_path)
        query = voice_result['query']

        # Search contacts
        contacts = contact_search_service.search_contacts(query, top_k=10, min_confidence=0.3)

        # Extract names and divisions (used for the header lines)
        names = name_extractor.extract_names(query)
        division_matches = embedding_service.find_division(query, top_k=3)
        matched_divisions = [m.division for m in division_matches]

        # Format output
        output = [
            f"**🎤 Transcribed Text:** {query}",
            f"**🌍 Language:** {voice_result['language_name']}",
        ]
        if voice_result['was_translated']:
            output.append(f"**📝 Original:** {voice_result['original_text']}")
        output.append("")
        output.append(format_contact_results(contacts, names, matched_divisions))

        return "\n".join(output)

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error in voice contact search: {e}")
        return f"Error: {str(e)}"

    finally:
        # Remove the temporary recording on every path, including when
        # transcription or the search failed part-way through; previously the
        # file was only removed on success.
        if temp_path is not None:
            try:
                voice_processor.cleanup_temp_file(temp_path)
            except Exception as cleanup_error:
                # A failed cleanup must not replace a result already computed.
                logger.warning(f"Could not remove temporary audio file: {cleanup_error}")
233
+
234
+
235
def create_demo():
    """Build the four-tab Gradio interface and return it without launching.

    Services are initialized first so the page header can show the contact
    database statistics. The text tabs and the voice tabs share one layout
    each, so they are produced by two local helpers.

    Returns:
        The ``gr.Blocks`` application.
    """
    stats = initialize_services()

    def add_text_tab(tab_title, intro, placeholder, button_label, handler):
        # One text tab: intro, query box + button on the left, results on the right.
        with gr.Tab(tab_title):
            gr.Markdown(intro)

            with gr.Row():
                with gr.Column():
                    query_box = gr.Textbox(
                        label="Enter your query",
                        placeholder=placeholder,
                        lines=2
                    )
                    run_button = gr.Button(button_label, variant="primary")

                with gr.Column():
                    results_panel = gr.Markdown(label="Results")

            run_button.click(
                fn=handler,
                inputs=[query_box],
                outputs=[results_panel]
            )

    def add_voice_tab(tab_title, intro, button_label, handler):
        # One voice tab: intro, microphone + button on the left, results on the right.
        with gr.Tab(tab_title):
            gr.Markdown(intro)

            with gr.Row():
                with gr.Column():
                    recorder = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your voice query"
                    )
                    run_button = gr.Button(button_label, variant="primary")

                with gr.Column():
                    results_panel = gr.Markdown(label="Results")

            run_button.click(
                fn=handler,
                inputs=[recorder],
                outputs=[results_panel]
            )

    with gr.Blocks(title="Voice Assistant Demo", theme=gr.themes.Soft()) as demo:

        gr.Markdown(f"""
        # 🎙️ Voice Assistant Demo
        ### Multi-language voice assistant with division matching and contact search

        **Database:** {stats['total_contacts']} contacts • {stats['departments']} departments • {stats['divisions']} divisions

        **Features:**
        - 🗣️ Speech-to-text in 99+ languages
        - 🔍 Smart division matching
        - 👤 Name extraction (English & Arabic)
        - 📞 Contact search with confidence scoring
        """)

        with gr.Tabs():

            # Tab 1: Division Matching (Text)
            add_text_tab(
                "📝 Division Matching (Text)",
                """
                ### Search for divisions by text query
                Try queries like:
                - "I need help from IT Security"
                - "Find someone in Finance"
                - "Connect me to Human Resources"
                - "Find Ahmed in App Dev"
                """,
                "e.g., I need help from IT Security",
                "🔍 Search Divisions",
                search_divisions_text,
            )

            # Tab 2: Division Matching (Voice)
            add_voice_tab(
                "🎤 Division Matching (Voice)",
                """
                ### Search for divisions by voice
                Speak your query in any language - it will be automatically transcribed and translated.
                """,
                "🔍 Search Divisions",
                search_divisions_voice,
            )

            # Tab 3: Contact Search (Text)
            add_text_tab(
                "👥 Contact Search (Text)",
                """
                ### Search for contacts by text query
                Try queries like:
                - "Find Dima in Information Technology"
                - "Ahmed Al-Malek"
                - "I need to talk to someone in Legal"
                - "Find Rashed in Finance"
                """,
                "e.g., Find Dima in Information Technology",
                "🔍 Search Contacts",
                search_contacts_text,
            )

            # Tab 4: Contact Search (Voice)
            add_voice_tab(
                "🎙️ Contact Search (Voice)",
                """
                ### Search for contacts by voice
                Speak your query in any language to find contacts.
                """,
                "🔍 Search Contacts",
                search_contacts_voice,
            )

        gr.Markdown("""
        ---
        **Models:**
        - Embeddings: `sentence-transformers/all-MiniLM-L6-v2`
        - Name Extraction: `urchade/gliner_small-v2.1`
        - Speech-to-Text: `openai/whisper-tiny`

        **Supported Languages:** 99+ languages (auto-detected)
        """)

    return demo
378
+
379
+
380
+ if __name__ == "__main__":
381
+ demo = create_demo()
382
+ demo.launch()
contact_search_service.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # contact_search_service.py
2
+ """
3
+ Contact search service with intelligent matching:
4
+ - Name-based search (exact and fuzzy matching)
5
+ - Division-based search
6
+ - Combined search (name + division)
7
+ - Confidence scoring
8
+ """
9
+
10
+ import logging
11
+ from typing import List, Dict, Optional, Tuple
12
+ from difflib import SequenceMatcher
13
+ import re
14
+
15
+ from contacts_data import (
16
+ get_all_contacts,
17
+ get_contacts_by_division,
18
+ get_contact_by_name
19
+ )
20
+ from name_extraction_service import NameExtractor
21
+ from embedding_service import EmbeddingService
22
+
23
+ # Set up logging
24
+ logging.basicConfig(level=logging.INFO)
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class ContactSearchService:
29
+ """
30
+ Service for searching contacts with intelligent matching.
31
+
32
+ Features:
33
+ - Exact name matching (100% confidence)
34
+ - Fuzzy name matching (partial names, typos)
35
+ - Division-based matching
36
+ - Combined search (name + division)
37
+ - Multi-language support (English and Arabic)
38
+ """
39
+
40
def __init__(self, name_extractor: NameExtractor, embedding_service: EmbeddingService):
    """
    Store the collaborating services and load the contact list.

    Args:
        name_extractor: Service used to extract person names from queries
        embedding_service: Service used to match queries to divisions
    """
    self.name_extractor = name_extractor
    self.embedding_service = embedding_service

    # Loaded once here and kept on the instance.
    self.all_contacts = get_all_contacts()
    contact_count = len(self.all_contacts)

    logger.info(f"ContactSearchService initialized with {contact_count} contacts")
57
+
58
def search_contacts(
    self,
    query: str,
    top_k: int = 10,
    min_confidence: float = 0.3
) -> List[Dict]:
    """
    Search for contacts based on query.

    Process:
    1. Extract names from the query.
    2. Find the top three matching divisions.
    3. Score each candidate contact:
       - name AND division match → max of both confidences + 0.15 (capped at 1.0)
       - division match only → the division's confidence
       - name match only → the name confidence, multiplied by 0.3 when the
         query also named a division with confidence >= 0.58
         (exact name = 1.0, fuzzy name = 0.5 + 0.45 * similarity)
    4. Drop results below ``min_confidence``, sort by confidence
       (descending) and keep the first ``top_k``.

    Ties are broken deterministically: name matches come before
    division-only matches, each in the order they were found.

    Args:
        query: Search query (English or Arabic)
        top_k: Maximum number of results to return
        min_confidence: Minimum confidence threshold (0.0-1.0)

    Returns:
        List of matched contact dicts (copies), each extended with
        ``confidence`` and ``match_reason`` plus the component confidences
        that applied. Empty when the query is blank or ``top_k`` is not
        positive.
    """
    # Nothing to search for, or nothing may be returned.
    if not query or not query.strip() or top_k <= 0:
        return []

    logger.info(f"Searching contacts for query: '{query}'")

    # Step 1: Extract names from query
    extracted_names = self.name_extractor.extract_names(query)
    logger.info(f"Extracted names: {extracted_names}")

    # Step 2: Find matching divisions
    division_matches = self.embedding_service.find_division(query, top_k=3)
    logger.info(f"Found {len(division_matches)} division matches")

    requested_divisions = [dm.division for dm in division_matches]

    # Strategy A: name-based candidates, keyed by contact ID
    name_matches = {}
    for name in extracted_names:
        # Try exact match first
        exact_match = get_contact_by_name(name)
        if exact_match:
            name_matches[exact_match["id"]] = {
                "contact": exact_match,
                "confidence": 1.0,
                "similarity": 1.0,
                "match_type": "exact"
            }
            logger.info(f"✓ Exact name match: {exact_match['full_name_en']}")
            continue

        # Fuzzy name matching
        for contact, similarity in self._fuzzy_name_search(name, top_k=10):
            contact_id = contact["id"]
            # Only keep best match for each contact
            if contact_id not in name_matches or similarity > name_matches[contact_id]["similarity"]:
                name_matches[contact_id] = {
                    "contact": contact,
                    "confidence": round(0.5 + (similarity * 0.45), 2),
                    "similarity": round(similarity, 2),
                    "match_type": "fuzzy"
                }
                logger.info(
                    f"Fuzzy name match: {contact['full_name_en']} "
                    f"(similarity: {similarity:.2f})"
                )

    # Strategy B: division-based candidates, keyed by contact ID
    division_matches_dict = {}
    for div_match in division_matches:
        division = div_match.division
        division_confidence = div_match.confidence

        for contact in get_contacts_by_division(division):
            contact_id = contact["id"]
            # Only keep best division match for each contact
            if contact_id not in division_matches_dict or division_confidence > division_matches_dict[contact_id]["confidence"]:
                division_matches_dict[contact_id] = {
                    "contact": contact,
                    "confidence": division_confidence,
                    "division": division
                }

    # Strategy C: Combine matches intelligently
    # Priority when BOTH name and division are specified:
    # 1. Name + Correct Division = HIGHEST (both match)
    # 2. Correct Division only = HIGH (division is most important)
    # 3. Name + Wrong Division = LOW (penalize wrong division)

    # A division counts as "specified" only when the best match has
    # reasonable confidence (>= 0.58). This threshold helps avoid false
    # division matches from words like "Find" (which scores ~0.56) while
    # still catching abbreviations like "App Dev" (which scores ~0.59).
    # Assumes find_division returns its best match first.
    has_strong_division_match = bool(division_matches) and division_matches[0].confidence >= 0.58

    # Insertion-ordered union (name matches first). Iterating a set here
    # would make the order of equal-confidence results, and therefore
    # which ones survive the top_k cut, arbitrary.
    all_contact_ids = list(dict.fromkeys([*name_matches, *division_matches_dict]))

    # Each ID appears once in the union, so no de-duplication is needed.
    matched_contacts = []
    for contact_id in all_contact_ids:
        name_data = name_matches.get(contact_id)
        div_data = division_matches_dict.get(contact_id)

        if name_data is not None and div_data is not None:
            # BOTH name and division match - BEST CASE
            contact = name_data["contact"]

            # Take MAX of the two confidences and add a boost so that
            # division + name ranks above division alone.
            combined_confidence = min(
                1.0, max(name_data["confidence"], div_data["confidence"]) + 0.15
            )

            contact_result = contact.copy()
            contact_result["confidence"] = round(combined_confidence, 2)
            contact_result["match_reason"] = "name_and_division_match"
            contact_result["name_confidence"] = name_data["confidence"]
            contact_result["division_confidence"] = div_data["confidence"]
            matched_contacts.append(contact_result)

            logger.info(
                f"✓ BOTH match: {contact['full_name_en']} in {div_data['division']} "
                f"(final confidence: {contact_result['confidence']})"
            )

        elif div_data is not None:
            # Division match only (no name specified, or name didn't match this person)
            contact_result = div_data["contact"].copy()
            contact_result["confidence"] = div_data["confidence"]
            contact_result["match_reason"] = "division_match"
            contact_result["division_confidence"] = div_data["confidence"]
            matched_contacts.append(contact_result)

        elif has_strong_division_match:
            # Name match but WRONG division: heavy (70%) penalty because
            # the query confidently asked for a different division.
            contact = name_data["contact"]
            contact_result = contact.copy()
            contact_result["confidence"] = round(name_data["confidence"] * 0.3, 2)
            contact_result["match_reason"] = "name_match_wrong_division"
            contact_result["name_confidence"] = name_data["confidence"]
            contact_result["requested_division"] = ", ".join(requested_divisions[:2])
            matched_contacts.append(contact_result)

            logger.info(
                f"Name match with WRONG division: {contact['full_name_en']} "
                f"in {contact['division']} (wanted: {requested_divisions[0]}, "
                f"confidence: {contact_result['confidence']})"
            )

        else:
            # No division specified OR weak division match - use name confidence as-is
            contact_result = name_data["contact"].copy()
            contact_result["confidence"] = name_data["confidence"]
            contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
            contact_result["name_confidence"] = name_data["confidence"]
            matched_contacts.append(contact_result)

    # Filter by minimum confidence
    matched_contacts = [
        c for c in matched_contacts if c["confidence"] >= min_confidence
    ]

    # Sort by confidence (descending); the sort is stable, so ties keep
    # the deterministic candidate order established above.
    matched_contacts.sort(key=lambda x: x["confidence"], reverse=True)

    # Limit to top_k
    matched_contacts = matched_contacts[:top_k]

    logger.info(f"✓ Returning {len(matched_contacts)} matched contacts")

    return matched_contacts
262
+
263
+ def _fuzzy_name_search(
264
+ self,
265
+ name: str,
266
+ top_k: int = 5,
267
+ min_similarity: float = 0.75 # Increased from 0.6 to avoid false matches
268
+ ) -> List[Tuple[Dict, float]]:
269
+ """
270
+ Fuzzy name matching using string similarity with stricter rules.
271
+
272
+ Args:
273
+ name: Name to search for
274
+ top_k: Number of top matches to return
275
+ min_similarity: Minimum similarity threshold (0.0-1.0)
276
+
277
+ Returns:
278
+ List of (contact, similarity_score) tuples
279
+ """
280
+ matches = []
281
+
282
+ # Normalize name for comparison
283
+ name_normalized = self._normalize_name(name)
284
+
285
+ # Get first letter for initial matching (helps avoid false positives)
286
+ name_first_letter = name_normalized[0] if name_normalized else ''
287
+
288
+ for contact in self.all_contacts:
289
+ # Check against both Arabic and English names
290
+ full_name_en_normalized = self._normalize_name(contact["full_name_en"])
291
+ full_name_ar_normalized = self._normalize_name(contact["full_name_ar"])
292
+ first_name_en_normalized = self._normalize_name(contact["first_name_en"])
293
+ first_name_ar_normalized = self._normalize_name(contact["first_name_ar"])
294
+ last_name_en_normalized = self._normalize_name(contact["last_name_en"])
295
+ last_name_ar_normalized = self._normalize_name(contact["last_name_ar"])
296
+
297
+ # Calculate similarity against various name combinations
298
+ name_candidates = [
299
+ (full_name_en_normalized, "full_en"),
300
+ (full_name_ar_normalized, "full_ar"),
301
+ (first_name_en_normalized, "first_en"),
302
+ (first_name_ar_normalized, "first_ar"),
303
+ (last_name_en_normalized, "last_en"),
304
+ (last_name_ar_normalized, "last_ar"),
305
+ ]
306
+
307
+ best_similarity = 0
308
+ best_match_type = None
309
+
310
+ for candidate_name, match_type in name_candidates:
311
+ if not candidate_name:
312
+ continue
313
+
314
+ similarity = self._string_similarity(name_normalized, candidate_name)
315
+
316
+ # Apply stricter rules for fuzzy matching:
317
+ # 1. Names should start with the same letter (for English names)
318
+ # 2. Or have very high similarity (>= 0.85)
319
+ if match_type.endswith('_en'):
320
+ candidate_first_letter = candidate_name[0] if candidate_name else ''
321
+ # Require same first letter OR very high similarity
322
+ if candidate_first_letter != name_first_letter and similarity < 0.85:
323
+ continue # Skip this match
324
+
325
+ if similarity > best_similarity:
326
+ best_similarity = similarity
327
+ best_match_type = match_type
328
+
329
+ if best_similarity >= min_similarity:
330
+ matches.append((contact, best_similarity))
331
+
332
+ # Sort by similarity (descending)
333
+ matches.sort(key=lambda x: x[1], reverse=True)
334
+
335
+ return matches[:top_k]
336
+
337
+ def _normalize_name(self, name: str) -> str:
338
+ """Normalize name for comparison (lowercase, remove extra spaces)"""
339
+ return re.sub(r'\s+', ' ', name.strip().lower())
340
+
341
+ def _string_similarity(self, s1: str, s2: str) -> float:
342
+ """
343
+ Calculate string similarity using SequenceMatcher.
344
+
345
+ Returns:
346
+ Similarity score between 0.0 and 1.0
347
+ """
348
+ return SequenceMatcher(None, s1, s2).ratio()
349
+
350
+ def get_contact_stats(self) -> Dict:
351
+ """Get statistics about the contact database"""
352
+ from collections import Counter
353
+
354
+ dept_counts = Counter(contact["department"] for contact in self.all_contacts)
355
+ div_counts = Counter(contact["division"] for contact in self.all_contacts)
356
+
357
+ return {
358
+ "total_contacts": len(self.all_contacts),
359
+ "departments": len(dept_counts),
360
+ "divisions": len(div_counts),
361
+ "contacts_by_department": dict(dept_counts),
362
+ "contacts_by_division": dict(div_counts),
363
+ }
364
+
365
+
366
if __name__ == "__main__":
    # Manual smoke test: build the services, print database statistics and
    # run a handful of sample queries through the contact search.
    from name_extraction_service import NameExtractor
    from embedding_service import EmbeddingService

    print("Initializing services...")
    name_extractor = NameExtractor()
    embedding_service = EmbeddingService()
    search_service = ContactSearchService(name_extractor, embedding_service)

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    for label, key in (
        ("Total contacts", "total_contacts"),
        ("Departments", "departments"),
        ("Divisions", "divisions"),
    ):
        print(f"{label}: {stats[key]}")

    # Sample queries: name + division, division only, Arabic, keywords only
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Testing Contact Search")
    print(heavy_rule)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print(light_rule)

        results = search_service.search_contacts(query, top_k=3)

        if not results:
            print("No matches found.")
            continue

        for i, contact in enumerate(results, 1):
            print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
            print(f"   {contact['title_en']} - {contact['division']}")
            print(f"   {contact['email']} | Ext: {contact['extension']}")
            print(f"   Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")
contacts_data.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # contacts_data.py
2
+ """
3
+ Contact database with 500 fake contacts covering all departments and divisions.
4
+ Each contact has Arabic and English names for better search support.
5
+ """
6
+
7
+ from typing import List, Dict
8
+ from division_hierarchy import DIVISION_TO_DEPARTMENT
9
+ import random
10
+
11
+ # Common Arabic first names (male and female)
12
# Male first names in Arabic script; one is picked at random per generated
# contact. Every entry must consist of Arabic letters only.
ARABIC_FIRST_NAMES_MALE = [
    "محمد", "أحمد", "عبدالله", "عمر", "خالد", "سعد", "فيصل", "سلطان", "ناصر", "طلال",
    "عبدالعزيز", "فهد", "تركي", "سلمان", "بندر", "مشعل", "ماجد", "يوسف", "حسن", "علي",
    "وليد", "زياد", "رامي", "كريم", "عادل", "راشد", "مازن", "طارق", "إبراهيم", "عيسى",
    "نواف", "سامي", "بدر", "عاصم", "وسام", "هاني", "ثامر", "صالح", "ياسر", "جاسم",
    "هشام", "فواز", "معاذ", "عثمان", "أسامة", "باسل", "عمار", "نبيل", "توفيق", "جمال"
]
19
+
20
+ ARABIC_FIRST_NAMES_FEMALE = [
21
+ "فاطمة", "نورة", "سارة", "منى", "هند", "ريم", "لينا", "دانة", "شهد", "جود",
22
+ "رهف", "غلا", "عبير", "أمل", "ندى", "رنا", "لمى", "ديمة", "بشرى", "سمية",
23
+ "هيفاء", "ليلى", "زينب", "خلود", "شروق", "أريج", "جميلة", "رباب", "سلمى", "وفاء",
24
+ "عائشة", "خديجة", "مريم", "رقية", "زهراء", "نجود", "حصة", "عزة", "صفية", "ملاك",
25
+ "روان", "تالا", "جنى", "لين", "ريتاج", "أسماء", "سديم", "لمار", "بيان", "شيماء"
26
+ ]
27
+
28
+ # Common Arabic last names
29
+ ARABIC_LAST_NAMES = [
30
+ "العتيبي", "الدوسري", "القحطاني", "الشهري", "الغامدي", "الزهراني", "العنزي", "الحربي",
31
+ "المطيري", "العسيري", "السبيعي", "الشمري", "الجهني", "العمري", "البقمي", "الفهد",
32
+ "السديري", "الثبيتي", "الصقري", "الأحمد", "الخالد", "السليمان", "العبدالله", "الفهيد",
33
+ "الشايع", "الرشيد", "العجمي", "المالك", "الفريح", "الحمود", "الناصر", "الشريف",
34
+ "البلوي", "اليامي", "الوادعي", "الفيفي", "الشهراني", "البكري", "العسكر", "الراشد",
35
+ "الفايز", "الخليف", "المنيع", "العبيد", "السحيم", "الغنام", "السلمان", "الهاجري",
36
+ "النهدي", "الرويلي", "المري", "السواط", "الربيعان", "الدغيثر", "الفضلي", "القرني",
37
+ "الثنيان", "العريفي", "الهويدي", "الجريسي", "البدراني", "المهيدب", "السالم", "الحارثي",
38
+ "العطوي", "الصخري", "الرحيلي", "السعيد", "الحافظ", "الوهيبي", "البراك", "الضويان"
39
+ ]
40
+
41
+ # Job titles in English and Arabic by category
42
+ JOB_TITLES = {
43
+ "executive": [
44
+ ("Chief Executive Officer", "المدير التنفيذي"),
45
+ ("Executive Director", "المدير التنفيذي"),
46
+ ("Vice President", "نائب الرئيس"),
47
+ ("Senior Vice President", "نائب الرئيس الأول"),
48
+ ],
49
+ "management": [
50
+ ("Director", "مدير"),
51
+ ("Senior Manager", "مدير أول"),
52
+ ("Manager", "مدير"),
53
+ ("Assistant Manager", "مساعد مدير"),
54
+ ("Team Leader", "قائد فريق"),
55
+ ("Supervisor", "مشرف"),
56
+ ],
57
+ "specialist": [
58
+ ("Senior Specialist", "أخصائي أول"),
59
+ ("Specialist", "أخصائي"),
60
+ ("Senior Analyst", "محلل أول"),
61
+ ("Analyst", "محلل"),
62
+ ("Senior Consultant", "مستشار أول"),
63
+ ("Consultant", "مستشار"),
64
+ ("Senior Officer", "موظف أول"),
65
+ ("Officer", "موظف"),
66
+ ],
67
+ "technical": [
68
+ ("Senior Engineer", "مهندس أول"),
69
+ ("Engineer", "مهندس"),
70
+ ("Technical Lead", "قائد تقني"),
71
+ ("Developer", "مطور"),
72
+ ("Architect", "مهندس معماري"),
73
+ ],
74
+ "support": [
75
+ ("Coordinator", "منسق"),
76
+ ("Administrator", "إداري"),
77
+ ("Assistant", "مساعد"),
78
+ ("Associate", "معاون"),
79
+ ]
80
+ }
81
+
82
+ # Phone extensions (4-digit)
83
def generate_extension() -> str:
    """Return a random phone extension between 1000 and 9999 as a string."""
    number = random.randint(1000, 9999)
    return f"{number}"
86
+
87
+ # Email generation
88
def generate_email(first_name_en: str, last_name_en: str) -> str:
    """
    Build a ``first.last@sidf.gov.sa`` address from an English name.

    Both parts are lowercased and stripped of spaces and hyphens; no other
    characters are removed.
    """
    drop_separators = str.maketrans("", "", " -")
    local_part = ".".join(
        part.lower().translate(drop_separators)
        for part in (first_name_en, last_name_en)
    )
    return f"{local_part}@sidf.gov.sa"
94
+
95
+
96
# Arabic name -> English spelling, built once at import time rather than on
# every call. Keys must be pure Arabic script so they match the name lists.
_TRANSLITERATION_MAP = {
    # Male names
    "محمد": "Mohammed", "أحمد": "Ahmed", "عبدالله": "Abdullah", "عمر": "Omar", "خالد": "Khalid",
    "سعد": "Saad", "فيصل": "Faisal", "سلطان": "Sultan", "ناصر": "Nasser", "طلال": "Talal",
    "عبدالعزيز": "Abdulaziz", "فهد": "Fahad", "تركي": "Turki", "سلمان": "Salman", "بندر": "Bandar",
    "مشعل": "Mishaal", "ماجد": "Majed", "يوسف": "Yousef", "حسن": "Hassan", "علي": "Ali",
    "وليد": "Waleed", "زياد": "Ziyad", "رامي": "Rami", "كريم": "Kareem", "عادل": "Adel",
    "راشد": "Rashed", "مازن": "Mazen", "طارق": "Tariq", "إبراهيم": "Ibrahim", "عيسى": "Issa",
    "نواف": "Nawaf", "سامي": "Sami", "بدر": "Badr", "عاصم": "Asim", "وسام": "Wissam",
    "هاني": "Hani", "ثامر": "Thamer", "صالح": "Saleh", "ياسر": "Yasser", "جاسم": "Jasim",
    "هشام": "Hisham", "فواز": "Fawaz", "معاذ": "Muath", "عثمان": "Othman", "أسامة": "Osama",
    "باسل": "Basel", "عمار": "Ammar", "نبيل": "Nabil", "توفيق": "Tawfiq", "جمال": "Jamal",

    # Female names
    "فاطمة": "Fatima", "نورة": "Noura", "سارة": "Sarah", "منى": "Mona", "هند": "Hind",
    "ريم": "Reem", "لينا": "Lina", "دانة": "Dana", "شهد": "Shahad", "جود": "Joud",
    "رهف": "Rahaf", "غلا": "Ghala", "عبير": "Abeer", "أمل": "Amal", "ندى": "Nada",
    "رنا": "Rana", "لمى": "Lama", "ديمة": "Dima", "بشرى": "Bushra", "سمية": "Somaya",
    "هيفاء": "Haifa", "ليلى": "Layla", "زينب": "Zainab", "خلود": "Kholoud", "شروق": "Shorouq",
    "أريج": "Areej", "جميلة": "Jamila", "رباب": "Rabab", "سلمى": "Salma", "وفاء": "Wafa",
    "عائشة": "Aisha", "خديجة": "Khadija", "مريم": "Maryam", "رقية": "Ruqaya", "زهراء": "Zahra",
    "نجود": "Nujoud", "حصة": "Hessa", "عزة": "Azza", "صفية": "Safiya", "ملاك": "Malak",
    "روان": "Rawan", "تالا": "Tala", "جنى": "Jana", "لين": "Leen", "ريتاج": "Ritaj",
    "أسماء": "Asma", "سديم": "Sadeem", "لمار": "Lamar", "بيان": "Bayan", "شيماء": "Shaima",

    # Last names
    "العتيبي": "Al-Otaibi", "الدوسري": "Al-Dosari", "القحطاني": "Al-Qahtani", "الشهري": "Al-Shahri",
    "الغامدي": "Al-Ghamdi", "الزهراني": "Al-Zahrani", "العنزي": "Al-Anazi", "الحربي": "Al-Harbi",
    "المطيري": "Al-Mutairi", "العسيري": "Al-Asiri", "السبيعي": "Al-Subaie", "الشمري": "Al-Shammari",
    "الجهني": "Al-Juhani", "العمري": "Al-Omari", "البقمي": "Al-Buqami", "الفهد": "Al-Fahad",
    "السديري": "Al-Sudairi", "الثبيتي": "Al-Thubaiti", "الصقري": "Al-Saqri", "الأحمد": "Al-Ahmad",
    "الخالد": "Al-Khalid", "السليمان": "Al-Sulaiman", "العبدالله": "Al-Abdullah", "الفهيد": "Al-Fahaid",
    "الشايع": "Al-Shaya", "الرشيد": "Al-Rasheed", "العجمي": "Al-Ajmi", "المالك": "Al-Malek",
    "الفريح": "Al-Fraihi", "الحمود": "Al-Hamoud", "الناصر": "Al-Nasser", "الشريف": "Al-Shareef",
    "البلوي": "Al-Balawi", "اليامي": "Al-Yami", "الوادعي": "Al-Wadei", "الفيفي": "Al-Faifi",
    "الشهراني": "Al-Shahrani", "البكري": "Al-Bakri", "العسكر": "Al-Askar", "الراشد": "Al-Rashed",
    "الفايز": "Al-Fayez", "الخليف": "Al-Khleif", "المنيع": "Al-Manie", "العبيد": "Al-Obaid",
    "السحيم": "Al-Suhaim", "الغنام": "Al-Ghannam", "السلمان": "Al-Salman", "الهاجري": "Al-Hajri",
    "النهدي": "Al-Nahdi", "الرويلي": "Al-Ruwaili", "المري": "Al-Marri", "السواط": "Al-Sawat",
    "الربيعان": "Al-Rabian", "الدغيثر": "Al-Dughither", "الفضلي": "Al-Fadhli", "القرني": "Al-Qarni",
    "الثنيان": "Al-Thuniyan", "العريفي": "Al-Arifi", "الهويدي": "Al-Huwaidi", "الجريسي": "Al-Juraysi",
    "البدراني": "Al-Badrani", "المهيدب": "Al-Muhaidib", "السالم": "Al-Salem", "الحارثي": "Al-Harthi",
    "العطوي": "Al-Atawi", "الصخري": "Al-Sakhri", "الرحيلي": "Al-Rahili", "السعيد": "Al-Saeed",
    "الحافظ": "Al-Hafiz", "الوهيبي": "Al-Wahaibi", "البراك": "Al-Barrak", "الضويان": "Al-Dhuwayan",
}


def transliterate_arabic_name(arabic_name: str) -> str:
    """
    Look up the English spelling of a common Arabic first or last name.

    This is a fixed table lookup, not a general transliteration algorithm.

    Args:
        arabic_name: Name in Arabic script

    Returns:
        The English spelling from the table, or ``arabic_name`` unchanged
        when the name is not in the table
    """
    return _TRANSLITERATION_MAP.get(arabic_name, arabic_name)
147
+
148
+
149
def generate_contacts() -> List[Dict]:
    """
    Build 500 fake contacts spread as evenly as possible over all divisions.

    Every division gets ``500 // n`` contacts and the first ``500 % n``
    divisions get one extra. IDs are consecutive, starting at 1000. Names,
    titles and extensions come from the module-level ``random`` generator,
    so the output changes between runs unless that generator is seeded.

    Returns:
        List of contact dictionaries
    """
    roster: List[Dict] = []
    next_id = 1000  # Starting ID

    division_names = list(DIVISION_TO_DEPARTMENT)
    base_size, remainder = divmod(500, len(division_names))

    # Target share of each seniority tier within a division; every tier gets
    # at least one slot before the list is padded/trimmed to the headcount.
    tier_shares = (
        ("executive", 0.10),
        ("management", 0.20),
        ("specialist", 0.50),
        ("technical", 0.15),
        ("support", 0.05),
    )

    for position, division in enumerate(division_names):
        department_name, department_id = DIVISION_TO_DEPARTMENT[division]

        headcount = base_size + (1 if position < remainder else 0)

        tiers = []
        for tier_name, share in tier_shares:
            tiers += [tier_name] * max(1, int(headcount * share))

        # Pad with specialists, then trim, so there is exactly one tier per
        # contact in this division.
        shortfall = headcount - len(tiers)
        if shortfall > 0:
            tiers += ["specialist"] * shortfall
        tiers = tiers[:headcount]

        random.shuffle(tiers)

        for seniority in tiers:
            # 60% male, 40% female first names
            if random.random() < 0.6:
                first_name_pool = ARABIC_FIRST_NAMES_MALE
            else:
                first_name_pool = ARABIC_FIRST_NAMES_FEMALE
            first_name_ar = random.choice(first_name_pool)
            last_name_ar = random.choice(ARABIC_LAST_NAMES)

            first_name_en = transliterate_arabic_name(first_name_ar)
            last_name_en = transliterate_arabic_name(last_name_ar)

            title_en, title_ar = random.choice(JOB_TITLES[seniority])

            extension = generate_extension()
            email = generate_email(first_name_en, last_name_en)

            roster.append({
                "id": next_id,
                "first_name_ar": first_name_ar,
                "last_name_ar": last_name_ar,
                "full_name_ar": f"{first_name_ar} {last_name_ar}",
                "first_name_en": first_name_en,
                "last_name_en": last_name_en,
                "full_name_en": f"{first_name_en} {last_name_en}",
                "title_en": title_en,
                "title_ar": title_ar,
                "division": division,
                "department": department_name,
                "department_id": department_id,
                "email": email,
                "extension": extension,
                "phone": f"+966-11-218-{extension}",
            })
            next_id += 1

    return roster
237
+
238
+
239
+ # Generate contacts on module load
240
# The contact list is generated once, when this module is imported.
CONTACTS_DATABASE = generate_contacts()

# Lookup tables built from the generated list. The two name indexes are keyed
# by full name, so when several contacts share a name the last one wins.
CONTACTS_BY_NAME_AR = {}
CONTACTS_BY_NAME_EN = {}
CONTACTS_BY_DIVISION = {}
for contact in CONTACTS_DATABASE:
    CONTACTS_BY_NAME_AR[contact["full_name_ar"]] = contact
for contact in CONTACTS_DATABASE:
    CONTACTS_BY_NAME_EN[contact["full_name_en"]] = contact
for contact in CONTACTS_DATABASE:
    division = contact["division"]
    CONTACTS_BY_DIVISION.setdefault(division, []).append(contact)
251
+
252
+
253
def get_all_contacts() -> List[Dict]:
    """Return the module-level contact list (the list object itself, not a copy)."""
    return CONTACTS_DATABASE
256
+
257
+
258
def get_contacts_by_division(division: str) -> List[Dict]:
    """Return the contacts indexed under ``division``, or an empty list if there are none."""
    try:
        return CONTACTS_BY_DIVISION[division]
    except KeyError:
        return []
261
+
262
+
263
def get_contact_by_name(name: str) -> Dict:
    """
    Look up a contact by exact full name.

    The Arabic name index is consulted first, then the English one.

    Returns:
        The matching contact dictionary, or None when neither index has
        an entry for ``name``
    """
    for name_index in (CONTACTS_BY_NAME_AR, CONTACTS_BY_NAME_EN):
        found = name_index.get(name)
        if found:
            return found
    return None
276
+
277
+
278
if __name__ == "__main__":
    # Manual check: print a few generated contacts and the per-department
    # headcount, largest department first.
    from collections import Counter

    contacts = get_all_contacts()
    print(f"Generated {len(contacts)} contacts")
    print("\nSample contacts:")
    for number, contact in enumerate(contacts[:5], 1):
        print(f"{number}. {contact['full_name_en']} ({contact['full_name_ar']})")
        print(f"   {contact['title_en']} - {contact['division']}")
        print(f"   {contact['email']} | Ext: {contact['extension']}")
        print()

    dept_counts = Counter(contact["department"] for contact in contacts)
    print("\nContacts by Department:")
    for dept, count in dept_counts.most_common():
        print(f"  {dept}: {count}")
division_hierarchy.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # division_hierarchy.py
2
+ """
3
+ Mapping of specific divisions to parent departments
4
+ """
5
+
6
+ # Parent Department IDs and Names
7
+ DEPARTMENTS = {
8
+ "40000003": "Information Technology",
9
+ "40000010": "Finance",
10
+ "40000005": "Human Resources",
11
+ "40000009": "Legal",
12
+ "40000011": "Communication",
13
+ "40000012": "Business Development",
14
+ "40000004": "Strategic Planning",
15
+ "40000245": "Project Studies",
16
+ "40000246": "Market Studies",
17
+ "40020002": "Services and Facilities",
18
+ "40020003": "Purchasing and Contracts",
19
+ "40000794": "Governance and Compliance",
20
+ "40000638": "Credit Risk",
21
+ "40000565": "Portfolio Management",
22
+ "40000564": "Credit Relationship",
23
+ "40020004": "Credit Control Division",
24
+ "40000013": "Loans Support Division",
25
+ "40000667": "Enterprise Risk Management",
26
+ "40001089": "Audit Excellence Unit",
27
+ "40000096": "Operations Audit Division",
28
+ "40001088": "Credit Audit Division",
29
+ "40000097": "IT Audit Division",
30
+ "40000520": "Cyber Security Division",
31
+ "40020001": "SIDF Academy",
32
+ "40001188": "Corporate Excellence",
33
+ "40000522": "CEO Advisory Office",
34
+ "40000523": "Board Secretary",
35
+ "40020005": "Chief of Staff Office",
36
+ "40020000": "ILab",
37
+ "40000488": "Secondment",
38
+ "40001063": "Credit Analysis & Advisory Division",
39
+ "40000089": "VP Office Credit",
40
+ }
41
+
42
+ # Mapping: Division Name -> (Department Name, Department ID)
43
+ DIVISION_TO_DEPARTMENT = {
44
+ # FINANCE
45
+ "General Accounting Division": ("Finance", "40000010"),
46
+ "Accounts Payable Division": ("Finance", "40000010"),
47
+ "Accounts Receivable Division": ("Finance", "40000010"),
48
+ "Financial Reporting & Control Division": ("Finance", "40000010"),
49
+ "Audit & Financial Analysis": ("Finance", "40000010"),
50
+
51
+ # INFORMATION TECHNOLOGY
52
+ "IT Governance & Quality Division": ("Information Technology", "40000003"),
53
+ "Applications Development & Integrations": ("Information Technology", "40000003"),
54
+ "IT Infrastructure & Operations Div.": ("Information Technology", "40000003"),
55
+ "Applications Maintenance & Support Div.": ("Information Technology", "40000003"),
56
+ "IT Security Implementation & Operations": ("Information Technology", "40000003"),
57
+ "IT Manager's Office": ("Information Technology", "40000003"),
58
+ "Enterprise Architecture Team": ("Information Technology", "40000003"),
59
+ "Reporting & Data Analytics Unit": ("Information Technology", "40000003"),
60
+
61
+ # HUMAN RESOURCES
62
+ "Rewards & Hr Operations Division": ("Human Resources", "40000005"),
63
+ "HR Business Partner": ("Human Resources", "40000005"),
64
+ "Talent Development Division": ("Human Resources", "40000005"),
65
+ "Od & Talent Acquisition Division": ("Human Resources", "40000005"),
66
+ "Org.Culture & Initiatives Division": ("Human Resources", "40000005"),
67
+
68
+ # LEGAL
69
+ "Legal Agreements and Consultancy Division": ("Legal", "40000009"),
70
+ "Contracts & Mortgages and Guarantees Division": ("Legal", "40000009"),
71
+ "Cases Division": ("Legal", "40000009"),
72
+
73
+ # COMMUNICATION
74
+ "Public Relations & Media Division": ("Communication", "40000011"),
75
+ "Internal Communication Unit": ("Communication", "40000011"),
76
+
77
+ # BUSINESS DEVELOPMENT
78
+ "Marketing Division": ("Business Development", "40000012"),
79
+ "Partnerships Dev&Advisory Services Div": ("Business Development", "40000012"),
80
+
81
+ # STRATEGIC PLANNING
82
+ "Strategy Division": ("Strategic Planning", "40000004"),
83
+ "Corporate Performance Division": ("Strategic Planning", "40000004"),
84
+ "Knowledge Management Unit": ("Strategic Planning", "40000004"),
85
+
86
+ # PROJECTS
87
+ "Project Management Division": ("Project Studies", "40000245"),
88
+ "Projects Consultancy Division": ("Project Studies", "40000245"),
89
+ "Construction Consultancy Division": ("Project Studies", "40000245"),
90
+
91
+ # MARKET RESEARCH
92
+ "Market Research Div.": ("Market Studies", "40000246"),
93
+ "Market Studies Division": ("Market Studies", "40000246"),
94
+ "Business Intelligence Div.": ("Market Studies", "40000246"),
95
+
96
+ # FACILITIES
97
+ "Facilities Management Division": ("Services and Facilities", "40020002"),
98
+ "Documents and Administrative Communications Center": ("Services and Facilities", "40020002"),
99
+ "Security and Safety Division": ("Services and Facilities", "40020002"),
100
+ "Industrial Safety & Loss Prevention Div": ("Services and Facilities", "40020002"),
101
+
102
+ # PURCHASING
103
+ "Procurements Contracts and Vendors Division": ("Purchasing and Contracts", "40020003"),
104
+ "Purchasing Division": ("Purchasing and Contracts", "40020003"),
105
+
106
+ # GOVERNANCE
107
+ "Enterprise Governance Division": ("Governance and Compliance", "40000794"),
108
+ "Compliance Regulatory Division": ("Governance and Compliance", "40000794"),
109
+ "Compliance Operations Division": ("Governance and Compliance", "40000794"),
110
+ "Operational Risk Management Division": ("Enterprise Risk Management", "40000667"),
111
+ "Financial Risk Management Division": ("Enterprise Risk Management", "40000667"),
112
+ "Cybersecurity Governance Unit": ("Cyber Security Division", "40000520"),
113
+ "Cybersecurity Defense Unit": ("Cyber Security Division", "40000520"),
114
+
115
+ # CREDIT
116
+ "Credit Risk Division": ("Credit Risk", "40000638"),
117
+ "Credit Relationships Division": ("Credit Relationship", "40000564"),
118
+ "Credit Control Team - A": ("Credit Control Division", "40020004"),
119
+ "Credit Control Team - B": ("Credit Control Division", "40020004"),
120
+ "Collection Team": ("Credit Control Division", "40020004"),
121
+ "Loan Follow-Up Team": ("Loans Support Division", "40000013"),
122
+ "C&Lm Info Team": ("Credit Risk", "40000638"),
123
+
124
+ # PORTFOLIO
125
+ "Portfolio Manager'S Office": ("Portfolio Management", "40000565"),
126
+ "Portfolio Division": ("Portfolio Management", "40000565"),
127
+ "Special Assets Division": ("Portfolio Management", "40000565"),
128
+
129
+ # EXCELLENCE
130
+ "Programs Delivery Operations Division": ("Corporate Excellence", "40001188"),
131
+ "Customer Excellence Division": ("Corporate Excellence", "40001188"),
132
+ "Programs Design And Needs Assessment Division": ("Corporate Excellence", "40001188"),
133
+ "Programs Evaluation & Quality Assurance Division": ("Corporate Excellence", "40001188"),
134
+ "Product Dev For National Priorities Div": ("Corporate Excellence", "40001188"),
135
+ "Product Dev For Customer Empowerment Div": ("Corporate Excellence", "40001188"),
136
+ "Solutions & Design Division": ("Corporate Excellence", "40001188"),
137
+
138
+ # AUDIT
139
+ "Operations Audit Team": ("Operations Audit Division", "40000096"),
140
+
141
+ # ACADEMY
142
+ "Academy Strategic Partnerships Division": ("SIDF Academy", "40020001"),
143
+
144
+ # EXECUTIVE
145
+ "CEO Office": ("CEO Advisory Office", "40000522"),
146
+ }
147
+
148
+
149
def get_department(division_name: str) -> tuple:
    """
    Resolve a division to its parent department.

    Args:
        division_name: Name of the specific division

    Returns:
        The (department_name, department_id) tuple stored for the division,
        or (None, None) when the division is not in the mapping
    """
    try:
        return DIVISION_TO_DEPARTMENT[division_name]
    except KeyError:
        return (None, None)
160
+
161
+
162
def get_department_name(division_name: str) -> str:
    """
    Resolve a division to the name of its parent department.

    Args:
        division_name: Name of the specific division

    Returns:
        The department name, or "Unknown" when the lookup yields no name
    """
    department, _department_id = get_department(division_name)
    if department:
        return department
    return "Unknown"
174
+
175
+
176
def get_department_id(division_name: str) -> str:
    """
    Resolve a division to the ID of its parent department.

    Args:
        division_name: Name of the specific division

    Returns:
        The department ID, or None when the division is not in the mapping
    """
    lookup = get_department(division_name)
    return lookup[1]
embedding_service.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embedding_service.py
2
+ import logging
3
+ import numpy as np
4
+ from typing import List, Dict, Any
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from models import ExtractedInfo
8
+ from division_hierarchy import get_department_name
9
+
10
+ # Set up logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class EmbeddingService:
16
+ """
17
+ Service for fast division matching using sentence embeddings.
18
+
19
+ How it works:
20
+ 1. At startup: Encode all divisions into vectors (one-time cost)
21
+ 2. For each query: Encode query and find most similar division (fast!)
22
+
23
+ Speed: ~50-100ms (vs 4 seconds with LLM)
24
+ Size: ~150MB (vs 4.8GB with LLM)
25
+ """
26
+
27
+ def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
28
+ """
29
+ Initialize the embedding service.
30
+
31
+ Args:
32
+ model_name: Name of the sentence-transformers model to use.
33
+ 'all-MiniLM-L6-v2' is fast, small (22MB), and accurate.
34
+ """
35
+ logger.info(f"Loading embedding model: {model_name}")
36
+
37
+ # Load the pre-trained model
38
+ # This downloads the model on first run (~22MB)
39
+ self.model = SentenceTransformer(model_name)
40
+
41
+ logger.info("Model loaded successfully")
42
+
43
+ # Define all 67 divisions with COMPREHENSIVE keywords
44
+ # Format: (division_name, keywords_for_matching)
45
+ # Updated with exhaustive keywords for better accuracy
46
+ self.divisions_data = [
47
+ # FINANCE
48
+ ("General Accounting Division", "accounting general ledger bookkeeping financial records journal entries posting reconciliation account balancing closing books financial statements invoices receipts vouchers ledgers trial balance balance sheet accounting errors wrong entries posting mistakes account discrepancies accounting software ERP SAP financial systems submit receipts accounting question ledger issue fiscal year closing"),
49
+
50
+ ("Accounts Payable Division", "payable payments vendors bills invoices vendor payments bill processing payment approval invoice matching payment scheduling purchase orders vendor invoices payment vouchers remittance advice late payment vendor complaint unpaid invoice payment delay wrong payment payment systems vendor portal invoice processing pay vendor supplier payment creditors outstanding payments payment terms net 30 payment run"),
51
+
52
+ ("Accounts Receivable Division", "receivable collections revenue income customer payments invoicing customers collecting payments revenue recognition aging reports dunning sales invoices receipts credit memos statements of account collection letters unpaid invoice overdue payment customer not paying bad debt payment dispute billing system collection software customer portal debtors outstanding invoices aging report payment terms cash collection DSO"),
53
+
54
+ ("Financial Reporting & Control Division", "reporting control financial statements disclosure compliance monthly closing financial reporting variance analysis budget vs actual consolidation P&L income statement balance sheet cash flow statement management reports board reports reporting error wrong numbers financial discrepancy statement correction reporting software BI tools consolidation systems Excel models financial report monthly statements quarterly results budget variance GAAP IFRS financial controls internal controls SOX compliance reconciliation"),
55
+
56
+ ("Audit & Financial Analysis", "audit analysis review financial review internal audit financial audit account analysis variance investigation trend analysis ratio analysis audit reports findings recommendations analytical reports management letters audit finding control weakness financial irregularity discrepancy investigation audit software data analytics tools sampling tools need financial analysis audit request investigate transaction variance explanation external audit internal audit SOX testing control testing substantive testing"),
57
+
58
+ # INFORMATION TECHNOLOGY
59
+ ("IT Governance & Quality Division", "IT governance quality standards policies IT strategy IT planning governance framework quality assurance standards compliance IT policies procedures standards governance framework compliance reports policy violation non-compliance governance issue quality problem governance platforms policy management tools compliance software IT policy question IT standards governance requirement quality issue ITIL COBIT ISO 27001 IT framework service management change management IT department information technology IT division policy review IT controls IT quality"),
60
+
61
+ ("Applications Development & Integrations", "development apps integration software programming coding software development app building system integration API development custom development requirements design docs technical specs user stories sprint plans app not working software bug integration failure API issue development request IDE Git DevOps tools integration platforms development frameworks need new app software development integrate systems build application API connection agile scrum CI/CD microservices REST API web services mobile app web app IT department information technology IT division app dev application development software engineering technical development IT development IT team"),
62
+
63
+ ("IT Infrastructure & Operations Div.", "infrastructure servers operations IT systems hardware server management network operations system monitoring infrastructure maintenance capacity planning system docs network diagrams runbooks incident reports change requests server down network problem system slow hardware failure connectivity issue mouse broken keyboard not working monitor problem printer issue laptop problem desktop issue equipment malfunction computer broken screen not working monitoring tools server management network tools ticketing system server not working network down system issue hardware problem my mouse is broken keyboard not responding monitor not working printer offline laptop won't start computer broken data center cloud infrastructure virtualization VMware storage SAN NAS backup disaster recovery computers laptops desktops monitors keyboards mice printers scanners docking stations cables adapters IT department information technology IT operations system controls system performance performance issues slow system data management database management IT infrastructure IT team system monitoring monitoring systems technical infrastructure"),
64
+
65
+ ("Applications Maintenance & Support Div.", "maintenance support help desk application support software support incident management problem resolution ticket handling user support application fixes tickets incident reports knowledge base articles support documentation app not working software error login problem access issue system crash user can't login ticketing system remote support tools monitoring tools help desk software app not working software problem need help can't login error message application support service desk L1 support L2 support incident problem service request bug fix IT department information technology IT support IT help technical support IT assistance application support IT team need help with IT IT issues technical issues IT helpdesk support desk"),
66
+
67
+ ("IT Security Implementation & Operations", "security implementation IT security cyber protection information security security monitoring threat detection vulnerability management security implementation access control security policies incident reports vulnerability assessments security logs security breach hacked account virus malware suspicious activity phishing email password reset firewall antivirus SIEM IDS IPS security tools encryption my account is hacked security problem virus on computer suspicious email password issue can't access system cybersecurity InfoSec data protection encryption authentication authorization MFA SSO identity management IT department information technology IT security cyber security cybersecurity security team security division access issues access control access management security incident security breach IT security team"),
68
+
69
+ ("IT Manager's Office", "IT manager IT leadership IT management technology management IT strategy IT planning resource management vendor management IT budget IT plans strategy documents budget reports vendor contracts escalation IT complaint strategic IT question vendor issue speak to IT manager IT strategy IT escalation major IT issue IT director CIO IT leadership technology roadmap digital transformation"),
70
+
71
+ ("Enterprise Architecture Team", "architecture enterprise architecture IT design system architecture architecture design system planning technology standards solution architecture enterprise planning architecture diagrams blueprints technical standards architecture principles architecture question design review technical standards system design system architecture enterprise design technical architecture solution design EA TOGAF architecture framework reference architecture technology stack design patterns"),
72
+
73
+ ("Reporting & Data Analytics Unit", "analytics data reporting insights business intelligence data analysis report generation dashboard creation data visualization predictive analytics reports dashboards analytics KPI reports data insights visualizations report error wrong data dashboard not working data quality issue BI tools Tableau Power BI SQL data warehouse analytics platforms need a report data analysis dashboard KPI report business intelligence data insights big data data mining machine learning data science ETL data warehouse OLAP IT department information technology data analytics data team data management data systems"),
74
+
75
+ # HUMAN RESOURCES
76
+ ("Rewards & Hr Operations Division", "rewards HR operations salary compensation payroll benefits payroll processing salary administration benefits management compensation planning bonus calculation payslips salary letters benefits statements payroll reports tax documents salary delay payroll error wrong salary salary not received benefits issue bonus question pay stub problem my salary is delayed salary not received payroll error wrong amount paid benefits question bonus calculation pay raise HRIS payroll system benefits platform time and attendance total rewards variable pay incentives stock options pension health insurance leave balance"),
77
+
78
+ ("HR Business Partner", "HR business partner HRBP employee relations HR support employee support HR consultation performance management employee issues HR advice employee files HR policies performance docs employee relations cases employee complaint HR question manager support employee issue workplace problem HRIS case management HR portal HR help employee issue manager question HR advice workplace problem employee engagement organizational development change management workforce planning HR department human resources HR division HR team employee performance performance issues performance review performance management HR help HR support need HR assistance"),
79
+
80
+ ("Talent Development Division", "talent development training learning development employee growth training programs skills development career development learning paths competency building training materials course catalogs development plans training records certificates training request course enrollment development opportunity skill gap LMS e-learning platforms training management systems training course learning opportunity development program skills training professional development L&D upskilling reskilling leadership development technical training soft skills certifications"),
81
+
82
+ ("Od & Talent Acquisition Division", "acquisition recruitment hiring talent acquisition jobs candidates recruiting hiring candidate sourcing interviewing job posting onboarding job descriptions CVs resumes offer letters recruitment reports hiring request job opening recruitment question candidate issue onboarding problem ATS recruitment platforms LinkedIn job boards need to hire job opening recruitment new position hiring process interview candidate talent acquisition sourcing headhunting employer branding candidate experience assessment"),
83
+
84
+ ("Org.Culture & Initiatives Division", "culture initiatives employee engagement organizational culture workplace culture engagement programs culture building employee initiatives recognition programs wellness programs engagement surveys culture reports initiative proposals recognition materials engagement issue culture problem initiative request employee morale employee engagement culture initiative recognition program employee wellness team building employee experience values mission vision culture transformation employee satisfaction DEI"),
85
+
86
+ # LEGAL
87
+ ("Legal Agreements and Consultancy Division", "legal agreements consultancy legal advice contracts consultation legal review contract drafting legal consultation agreement negotiation legal advice contracts agreements legal opinions legal memos NDAs MOUs legal question contract review needed agreement issue legal advice request need legal advice contract review legal question agreement help legal consultation legal counsel commercial law corporate law legal advisory contract law"),
88
+
89
+ ("Contracts & Mortgages and Guarantees Division", "contracts mortgages guarantees security collateral mortgage management guarantee processing contract administration collateral management security documentation mortgage documents guarantee letters security agreements pledge documents collateral records mortgage question guarantee issue security problem contract question mortgage document guarantee letter security agreement collateral question pledge loan security real estate property liens hypothecation financial guarantees"),
90
+
91
+ ("Cases Division", "cases litigation legal cases lawsuits disputes court litigation management case handling dispute resolution court proceedings legal claims lawsuits court filings legal claims case files judgments settlements legal case lawsuit dispute court matter legal claim legal case lawsuit dispute court matter litigation legal claim arbitration litigation arbitration mediation legal proceedings court case claims management judgments"),
92
+
93
+ # COMMUNICATION
94
+ ("Public Relations & Media Division", "PR media public relations press communications announcements media relations press releases public announcements media monitoring crisis communication press releases media statements communication materials press kits media inquiry press question public statement needed PR crisis media request press release public announcement PR question media inquiry corporate communications external communications media coverage press conference spokesperson"),
95
+
96
+ ("Internal Communication Unit", "internal communication employee communication announcements internal comms employee messaging internal announcements town halls newsletters intranet management internal memos newsletters announcements employee updates town hall materials communication request announcement needed employee messaging internal news employee announcement internal communication newsletter staff message company news employee communications change communications internal messaging intranet employee engagement"),
97
+
98
+ # BUSINESS DEVELOPMENT
99
+ ("Marketing Division", "marketing branding promotion advertising campaigns brand management marketing campaigns market promotion advertising digital marketing content marketing marketing plans campaign materials brand guidelines promotional materials marketing reports marketing request branding question campaign support promotional material marketing support branding promotion advertising marketing campaign social media digital marketing SEO SEM social media marketing content marketing email marketing events"),
100
+
101
+ ("Partnerships Dev&Advisory Services Div", "partnerships advisory business partnerships strategic partnerships alliances partnership development alliance management collaboration partnership agreements joint ventures partnership agreements MOU collaboration agreements partnership proposals partnership opportunity collaboration request alliance question partnership opportunity business collaboration strategic alliance joint venture partnership strategic partnerships business alliances co-marketing channel partnerships ecosystem"),
102
+
103
+ # STRATEGIC PLANNING
104
+ ("Strategy Division", "strategy strategic planning business strategy corporate strategy strategic planning strategy development business planning strategic initiatives roadmap planning strategic plans business plans strategy documents roadmaps strategic initiatives strategy question strategic planning business direction strategic planning business strategy strategic initiative corporate strategy long-term planning corporate strategy business model strategic objectives strategic goals vision mission"),
105
+
106
+ ("Corporate Performance Division", "performance corporate performance KPIs metrics objectives performance management KPI tracking metrics monitoring objective setting performance review KPI reports scorecards performance dashboards objective tracking performance reviews performance tracking KPI question metrics issue objective setting KPI tracking performance metrics corporate performance objectives scorecards BSC balanced scorecard OKRs performance indicators strategic metrics targets goals"),
107
+
108
+ ("Knowledge Management Unit", "knowledge management information management documentation knowledge sharing knowledge capture documentation knowledge sharing content management knowledge repository knowledge base documentation procedures best practices lessons learned documentation request knowledge access information search knowledge base documentation procedure how to best practices process documentation knowledge base wiki document management content management information architecture"),
109
+
110
+ # PROJECTS
111
+ ("Project Management Division", "project management projects PMO project planning project execution project planning project execution project monitoring project control project delivery project plans schedules WBS project charters status reports project documentation project issue project delay project question project planning project management project planning PMO project delivery project status project delivery PMO PMP project lifecycle project portfolio agile waterfall project governance"),
112
+
113
+ ("Projects Consultancy Division", "projects consultancy project advisory project consultation project services project consulting advisory services project expertise project guidance project assessment consulting reports project assessments advisory recommendations project studies project consultation needed project advice expert guidance project consultation project advisory project expert project guidance project assessment consulting advisory project expertise technical assistance project evaluation"),
114
+
115
+ ("Construction Consultancy Division", "construction building construction consultancy construction projects engineering construction planning building supervision construction management site supervision construction quality construction plans building specs drawings blueprints construction reports inspection reports construction issue building problem site question construction quality construction project building construction construction supervision site management engineering civil engineering structural engineering construction management building codes specifications BOQ"),
116
+
117
+ # MARKET RESEARCH
118
+ ("Market Research Div.", "market research research market analysis market intelligence market research data collection market surveys research analysis market intelligence research reports market studies survey results research findings market analysis research request market data needed research question market research market data research study market analysis market intelligence market trends market research surveys focus groups market data industry analysis competitive intelligence"),
119
+
120
+ ("Market Studies Division", "market studies industry analysis market assessment market evaluation market studies industry research market assessment feasibility studies market evaluation market study reports industry reports assessment reports feasibility studies market study request industry analysis needed feasibility question market study industry analysis market assessment feasibility study market evaluation industry research market sizing market segmentation market opportunity market attractiveness"),
121
+
122
+ ("Business Intelligence Div.", "business intelligence market intelligence competitive intelligence market insights intelligence gathering competitor analysis market monitoring intelligence reporting intelligence reports competitor profiles market briefings intelligence updates intelligence request competitor information market intelligence business intelligence competitor analysis market intelligence competitive intelligence market insights CI competitive analysis market monitoring industry intelligence strategic intelligence"),
123
+
124
+ # FACILITIES
125
+ ("Facilities Management Division", "facilities building management office management facilities services facility maintenance building operations space management facility services office management maintenance schedules facility reports work orders service requests facility problem building issue office space maintenance request AC not working lights not working office equipment furniture request space issue cleaning request AC not working office too cold lights broken need furniture office space facility maintenance building problem room too hot chair broken desk request CMMS facility management software work order systems HVAC air conditioning heating ventilation lighting furniture office supplies workspace meeting rooms office equipment janitorial cleaning waste management"),
126
+
127
+ ("Documents and Administrative Communications Center", "documents administrative communications center document management records document processing records management filing archiving document distribution mail handling official documents correspondence letters memos records archives document request filing question records access document retrieval need a document official letter document filing records request archive access correspondence records management archiving filing system document control correspondence management official documents document filing document archiving paper documents document center administrative documents"),
128
+
129
+ ("Security and Safety Division", "security safety protection guards access control security operations access control visitor management security monitoring emergency response security reports incident reports visitor logs access logs security procedures security incident lost badge access card problem visitor escort security concern suspicious activity emergency safety hazard lost my badge access card not working need visitor pass security escort security incident emergency safety concern suspicious person physical security badge ID card access card visitor management CCTV guards patrols emergency procedures evacuation physical security building security premises security facility security incident report security event safety incident safety event"),
130
+
131
+ ("Industrial Safety & Loss Prevention Div", "industrial safety loss prevention occupational safety workplace safety HSE safety management risk assessment safety inspections incident investigation loss prevention safety reports incident reports risk assessments safety procedures investigation reports safety incident accident injury safety hazard unsafe condition near miss safety incident workplace accident safety hazard injury report unsafe condition PPE safety training HSE occupational health safety compliance OSHA safety regulations PPE accident prevention safety training"),
132
+
133
+ # PURCHASING
134
+ ("Procurements Contracts and Vendors Division", "procurement contracts vendors suppliers purchasing sourcing procurement vendor management contract management supplier management sourcing tendering purchase orders contracts RFP RFQ vendor agreements tender documents vendor issue contract question procurement request supplier problem procurement vendor contract supplier agreement tender RFP purchase order sourcing strategic sourcing supplier relationship contract negotiation vendor evaluation procurement process"),
135
+
136
+ ("Purchasing Division", "purchasing buying procurement purchase orders requisitions purchasing buying order processing purchase requisitions goods receipt purchasing approval purchase orders requisitions quotes purchase requests delivery notes purchase request buying question order status delivery problem purchase approval need to buy purchase request order status buying approval purchase something get a quote purchase requisition PO purchase approval ordering buying process goods receipt"),
137
+
138
+ # GOVERNANCE
139
+ ("Enterprise Governance Division", "enterprise governance governance policies corporate governance board governance governance framework policy development board governance corporate governance governance compliance policies governance framework board papers governance reports charters governance question policy issue compliance question governance requirement governance corporate policies board governance governance framework governance compliance corporate governance board of directors governance structure governance best practices King IV"),
140
+
141
+ ("Compliance Regulatory Division", "regulatory compliance regulations regulatory compliance legal compliance regulatory compliance regulation monitoring compliance assessment regulatory reporting compliance review compliance reports regulatory filings compliance certificates regulatory updates regulatory requirement compliance question regulation change compliance issue regulatory compliance regulations compliance requirement regulatory filing compliance report regulatory requirements industry regulations compliance standards regulatory framework regulatory authorities"),
142
+
143
+ ("Compliance Operations Division", "compliance operations compliance monitoring compliance management compliance controls compliance monitoring compliance testing compliance controls compliance verification compliance tracking compliance reports monitoring reports compliance checklists test results compliance logs compliance breach control failure compliance issue monitoring finding compliance monitoring compliance testing compliance controls compliance verification compliance tracking compliance program compliance testing monitoring controls compliance assurance compliance activities"),
144
+
145
+ ("Operational Risk Management Division", "operational risk risk management operational risk management risk controls risk identification risk assessment risk mitigation control testing risk monitoring risk registers risk assessments control matrices risk reports mitigation plans operational risk risk event control failure risk issue operational failure operational risk risk management risk assessment control issue risk event operational failure risk framework COSO ERM risk appetite risk tolerance key risk indicators KRIs system controls controls management risk control internal controls control framework assessment needed risk assessment risk division risk department risk team control systems controls review"),
146
+
147
+ ("Financial Risk Management Division", "financial risk risk management credit risk market risk liquidity risk financial risk assessment risk modeling stress testing scenario analysis risk measurement risk reports stress test results risk models VaR reports risk metrics financial risk market risk credit risk event liquidity issue financial risk market risk credit risk liquidity risk risk modeling stress testing Basel capital adequacy VaR credit risk market risk liquidity risk ALM risk metrics"),
148
+
149
+ ("Cybersecurity Governance Unit", "cybersecurity governance security governance information security governance security policies security governance security policy development security framework security standards security compliance security policies security framework security standards security guidelines security charters security policy question security governance security framework security compliance cybersecurity governance security policies security framework information security governance security standards ISO 27001 NIST security framework security governance information security management ISMS"),
150
+
151
+ ("Cybersecurity Defense Unit", "cybersecurity defense security operations cyber defense threat detection security monitoring threat monitoring incident response security operations threat hunting vulnerability management security incidents threat reports incident response plans security alerts IOCs cyber attack security breach hacked malware virus phishing ransomware data breach security incident cyber attack hacked virus phishing email ransomware security incident data breach malware suspicious email SOC security operations center threat intelligence incident response cyber threats APT zero-day exploit vulnerability"),
152
+
153
+ # CREDIT
154
+ ("Credit Risk Division", "credit risk credit assessment credit evaluation credit analysis default risk credit assessment credit evaluation credit scoring default probability credit rating credit review credit reports credit assessments credit scores rating reports credit analysis credit risk concern default risk credit quality credit deterioration credit risk credit assessment credit evaluation default risk credit quality credit rating PD LGD EAD credit scoring credit rating credit underwriting NPL non-performing loans"),
155
+
156
+ ("Credit Relationships Division", "credit relationships customer relations client management customer service account management customer relationship management client servicing account management customer support relationship building customer profiles relationship reports service records customer communications customer complaint client issue relationship problem customer service customer service client support customer complaint account manager relationship manager customer issue CRM customer relationship client servicing relationship manager account management customer satisfaction"),
157
+
158
+ ("Credit Control Team - A", "credit control collections monitoring credit monitoring team A credit monitoring collection activities payment follow-up credit limits exposure monitoring collection reports payment schedules credit memos monitoring reports dunning letters overdue payment collection issue credit limit payment delay delinquency credit control collections overdue payment payment monitoring credit limit collection collections receivables management credit monitoring payment tracking delinquency write-off"),
159
+
160
+ ("Credit Control Team - B", "credit control collections monitoring credit monitoring team B credit monitoring collection activities payment follow-up credit limits exposure monitoring collection reports payment schedules credit memos monitoring reports dunning letters overdue payment collection issue credit limit payment delay delinquency credit control collections overdue payment payment monitoring credit limit collection collections receivables management credit monitoring payment tracking delinquency write-off"),
161
+
162
+ ("Collection Team", "collection collections recovery debt collection payment recovery debt collection payment recovery collection activities recovery process collection calls collection letters payment plans recovery reports collection logs non-payment collection case recovery issue difficult customer debt collection payment recovery collection recover payment non-paying customer collections debt recovery recovery payment collection delinquent accounts bad debt"),
163
+
164
+ ("Loan Follow-Up Team", "loan follow-up loan monitoring loan servicing loan administration loan monitoring loan servicing payment follow-up loan administration loan review loan files payment schedules loan reports servicing records loan reviews loan payment loan question payment issue loan servicing loan problem loan payment loan help loan issue loan servicing loan monitoring loan question payment schedule loan servicing loan administration loan payments loan portfolio loan monitoring loan review disbursement"),
165
+
166
+ ("C&Lm Info Team", "credit info information credit information credit data information management credit data information services data management credit information credit reports information reports data files credit information information request credit data information access credit information credit data information request credit reports data access credit bureau credit information credit data information services"),
167
+
168
+ # PORTFOLIO
169
+ ("Portfolio Manager'S Office", "portfolio manager portfolio management office portfolio leadership portfolio management portfolio strategy portfolio oversight portfolio governance portfolio reports portfolio strategy portfolio reviews management reports portfolio question portfolio strategy portfolio oversight portfolio manager portfolio strategy portfolio management portfolio oversight portfolio governance portfolio leadership portfolio strategy asset management"),
170
+
171
+ ("Portfolio Division", "portfolio portfolio management asset management investment portfolio portfolio management asset management portfolio monitoring portfolio optimization portfolio reporting portfolio reports asset statements portfolio analysis performance reports portfolio question asset issue portfolio performance investment question portfolio asset management portfolio performance investment portfolio portfolio monitoring my portfolio portfolio management asset allocation investment management portfolio optimization asset management portfolio performance"),
172
+
173
+ ("Special Assets Division", "special assets distressed assets problem assets asset recovery asset recovery workout restructuring distressed asset management asset resolution workout plans restructuring plans asset reports recovery reports problem asset distressed loan asset recovery troubled asset special assets problem asset distressed asset asset recovery workout restructuring NPL management asset resolution workout restructuring distressed debt asset recovery"),
174
+
175
+ # EXCELLENCE
176
+ ("Programs Delivery Operations Division", "programs delivery operations program execution program operations program delivery program execution operational delivery program operations program implementation program reports delivery schedules operational reports program status program delivery operational issue program execution delivery problem program delivery program operations program execution delivery operations program implementation program management program delivery operational excellence program execution"),
177
+
178
+ ("Customer Excellence Division", "customer excellence customer service customer experience customer satisfaction customer service customer experience management service excellence customer complaints customer feedback service quality customer feedback service reports satisfaction surveys complaint reports NPS reports customer complaint service issue poor service customer dissatisfaction complaint bad experience customer complaint service problem poor service complaint customer service not satisfied bad experience complaint about service customer satisfaction NPS customer experience CX service quality customer feedback complaints handling customer care"),
179
+
180
+ ("Programs Design And Needs Assessment Division", "programs design needs assessment program development program planning program design needs analysis program planning program development requirements gathering needs assessments program designs requirements documents program proposals program design needs assessment program planning development request program design needs assessment program development program planning requirements analysis program development needs analysis program design program planning feasibility"),
181
+
182
+ ("Programs Evaluation & Quality Assurance Division", "programs evaluation quality assurance program assessment QA program review program evaluation quality assurance program assessment program review performance evaluation evaluation reports QA reports assessment reports review reports program audits program evaluation quality issue assessment request program review program evaluation quality assurance program assessment QA program review program quality program evaluation quality control QA quality assurance program effectiveness program impact"),
183
+
184
+ ("Product Dev For National Priorities Div", "product development national priorities new products product innovation product development new product creation product innovation product design national programs product proposals product specs development plans product roadmaps new product product development product idea innovation request new product product development product innovation develop new product national priorities product idea product management product innovation new products product design product strategy national initiatives"),
185
+
186
+ ("Product Dev For Customer Empowerment Div", "product development customer empowerment customer products customer-focused products customer product development customer-centric design product innovation customer empowerment programs product proposals customer research product specs customer feedback customer product product for customers customer empowerment customer-focused development customer products customer empowerment customer-focused products products for customers customer innovation customer empowerment customer-centric products customer solutions customer value"),
187
+
188
+ ("Solutions & Design Division", "solutions design solution design solution development solutions architecture solution design solution development solution architecture solution delivery design thinking solution designs design documents solution proposals solution blueprints solution design solution request design question solution development solution design solution development design solution solutions custom solution solution architecture design thinking solution engineering solution delivery custom solutions"),
189
+
190
+ # AUDIT
191
+ ("Operations Audit Team", "operations audit operational audit audit internal audit operational review operational auditing process audit operational review compliance audit operational assessment audit reports audit findings audit recommendations operational audit reports audit request operational audit audit finding audit question operations audit operational audit audit request internal audit operational review internal audit operational audit process audit compliance audit operational controls"),
192
+
193
+ # ACADEMY
194
+ ("Academy Strategic Partnerships Division", "academy training education learning development partnerships training education programs learning professional development courses workshops certifications partnerships course catalogs training materials certificates learning plans partnership agreements training request course enrollment certification learning opportunity education program training education learning courses workshop certification professional development academy programs training courses SIDF Academy corporate university training programs learning center professional development certifications workshops seminars e-learning leadership development"),
195
+
196
+ # EXECUTIVE
197
+ ("CEO Office", "CEO chief executive executive office CEO office leadership executive management strategic leadership executive decisions CEO communications executive governance executive reports board papers CEO communications strategic documents executive escalation CEO office executive question strategic matter CEO executive office chief executive CEO office executive escalation top management C-suite executive leadership CEO chief executive officer executive management strategic leadership"),
198
+ ]
199
+
200
+ # Extract just the division names and search texts
201
+ self.division_names = [div[0] for div in self.divisions_data]
202
+ self.division_search_texts = [
203
+ f"{div[0]} {div[1]}" for div in self.divisions_data
204
+ ]
205
+
206
+ logger.info(f"Pre-encoding {len(self.division_names)} divisions...")
207
+
208
+ # PRE-ENCODE all divisions (this is the magic!)
209
+ # This happens once at startup, then queries are super fast
210
+ self.division_embeddings = self.model.encode(
211
+ self.division_search_texts,
212
+ convert_to_numpy=True,
213
+ show_progress_bar=True
214
+ )
215
+
216
+ logger.info(f"✓ Encoded {len(self.division_names)} divisions")
217
+ logger.info(f"Embedding shape: {self.division_embeddings.shape}")
218
+
219
+ # ALSO ENCODE DEPARTMENTS
220
+ # Get unique departments and build department → divisions mapping
221
+ from contacts_data import get_all_contacts
222
+ from collections import defaultdict
223
+
224
+ contacts = get_all_contacts()
225
+ dept_to_divisions = defaultdict(set)
226
+
227
+ for contact in contacts:
228
+ dept_to_divisions[contact["department"]].add(contact["division"])
229
+
230
+ self.dept_to_divisions = {dept: list(divs) for dept, divs in dept_to_divisions.items()}
231
+ self.department_names = list(self.dept_to_divisions.keys())
232
+
233
+ # Create search texts for departments (department name + common keywords)
234
+ self.department_search_texts = []
235
+ for dept in self.department_names:
236
+ # Add department name and common keywords
237
+ search_text = f"{dept} department team group unit"
238
+ self.department_search_texts.append(search_text)
239
+
240
+ logger.info(f"Pre-encoding {len(self.department_names)} departments...")
241
+ self.department_embeddings = self.model.encode(
242
+ self.department_search_texts,
243
+ convert_to_numpy=True,
244
+ show_progress_bar=False
245
+ )
246
+
247
+ logger.info(f"✓ Encoded {len(self.department_names)} departments")
248
+ logger.info("EmbeddingService ready!")
249
+
250
def find_division(self, query: str, top_k: int = 3) -> List[ExtractedInfo]:
    """
    Find the best matching divisions for a query.

    The query is embedded once and compared (cosine similarity) against the
    pre-encoded division vectors and the pre-encoded department vectors.
    Similarities are mapped from [-1, 1] to a 0-1 confidence via (s + 1) / 2.

    Args:
        query: User's search query (e.g., "app development", "HR help",
            "Information Technology").
        top_k: Maximum number of division-level matches to return
            (default: 3). Values below zero are treated as zero. Not applied
            when a department-level match wins (see below).

    Returns:
        List of ExtractedInfo objects with division, department and a
        confidence rounded to two decimals (always a plain Python float).

        * If the best department confidence beats the best division
          confidence by at least 0.01, EVERY division of that department is
          returned, each carrying the department's confidence.
        * Otherwise the ``top_k`` most similar divisions are returned, best
          first, with their parent department looked up via
          ``get_department_name``.
    """
    logger.info(f"Processing query: {query}")

    # STEP 1: Encode the query as a batch of one, so the result is 2-D.
    query_embedding = self.model.encode([query], convert_to_numpy=True)

    # STEP 2: Similarity of the query against every division / department.
    division_similarities = cosine_similarity(
        query_embedding,
        self.division_embeddings
    )[0]
    department_similarities = cosine_similarity(
        query_embedding,
        self.department_embeddings
    )[0]

    # STEP 3: Best score on each level, converted to plain Python floats so
    # no NumPy scalar leaks into the returned objects.
    best_dept_idx = int(np.argmax(department_similarities))
    best_division_conf = (float(np.max(division_similarities)) + 1) / 2
    best_dept_conf = (float(department_similarities[best_dept_idx]) + 1) / 2

    logger.info(f"Best division match confidence: {best_division_conf:.2f}")
    logger.info(f"Best department match confidence: {best_dept_conf:.2f}")

    results = []

    # A department wins only when it leads by a 0.01 margin. The margin is
    # deliberately small so department-level queries such as
    # "Information Technology" expand to all of that department's divisions.
    if (best_dept_conf - best_division_conf) >= 0.01:
        dept_name = self.department_names[best_dept_idx]
        divisions_in_dept = self.dept_to_divisions[dept_name]

        logger.info(f"✓ Department match: {dept_name} ({best_dept_conf:.2f}) - Expanding to ALL {len(divisions_in_dept)} divisions")

        # Return ALL divisions in this department with the department's
        # confidence, so a later contact search can cover the whole department.
        for division_name in divisions_in_dept:
            results.append(ExtractedInfo(
                division=division_name,
                department=dept_name,
                confidence=round(best_dept_conf, 2)
            ))

            logger.info(f" - {division_name} (dept match, confidence: {best_dept_conf:.2f})")

    else:
        # Regular division matching: indices sorted by descending similarity.
        # Clamp top_k so a negative value cannot slice from the wrong end.
        top_indices = np.argsort(division_similarities)[::-1][:max(top_k, 0)]

        for idx in top_indices:
            division_name = self.division_names[idx]
            similarity_score = float(division_similarities[idx])
            confidence = (similarity_score + 1) / 2

            # Get parent department
            department_name = get_department_name(division_name)

            results.append(ExtractedInfo(
                division=division_name,
                department=department_name,
                confidence=round(confidence, 2)
            ))

            logger.info(
                f"Match: {division_name} [{department_name}] "
                f"(similarity: {similarity_score:.3f}, "
                f"confidence: {confidence:.2f})"
            )

    logger.info(f"✓ Found {len(results)} matches")
    return results
name_extraction_service.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # name_extraction_service.py
2
+ import logging
3
+ from typing import List, Dict, Any, Optional
4
+ from gliner import GLiNER
5
+
6
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model: the entity types to look for are passed
    as free-text labels at prediction time instead of being fixed by the
    model. This class asks it for the labels in ``self.labels`` and treats
    every hit as a person name.
    """

    # Keywords recognised by extract_from_query(extract_divisions=True).
    # They are reported with exactly this spelling, in this order.
    DIVISION_KEYWORDS = (
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    )

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info(f"Loading GLiNER model: {model_name}")

        # Load the pre-trained model (downloaded on first run).
        self.model = GLiNER.from_pretrained(model_name)

        # Entity labels requested from the model; all are treated as names.
        self.labels = ["person", "name", "employee"]

        logger.info("✓ GLiNER model loaded successfully")
        logger.info(f"Entity labels: {self.labels}")

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT").
            threshold: Confidence threshold (0-1) forwarded to the model.
                Lower = more names but less precise.

        Returns:
            Extracted name strings, de-duplicated, in order of first
            appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
        """
        logger.info(f"Extracting names from: {text}")

        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold
        )

        # dict.fromkeys drops duplicates while preserving insertion order
        # (the same span can be reported under several labels).
        unique_names = list(dict.fromkeys(entity["text"] for entity in entities))

        logger.info(f"✓ Found {len(unique_names)} name(s): {unique_names}")

        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Unlike ``extract_names`` this does NOT de-duplicate: one entry is
        returned per entity reported by the model.

        Args:
            text: Input text.
            threshold: Confidence threshold (0-1) forwarded to the model.

        Returns:
            List of dictionaries with name details:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,   # model score rounded to 2 decimals
                    "label": "person"
                }
            ]
        """
        logger.info(f"Extracting names with context from: {text}")

        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold
        )

        results = [
            {
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            }
            for entity in entities
        ]

        logger.info(f"✓ Found {len(results)} name(s) with context")

        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text.
            extract_divisions: Whether to also look for the keywords in
                ``DIVISION_KEYWORDS``.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"],   # only if extract_divisions=True
                "has_divisions": True        # only if extract_divisions=True
            }
        """
        # Local import keeps this block self-contained; the module does not
        # import ``re`` at the top of the file.
        import re

        names = self.extract_names(query)

        result: Dict[str, Any] = {
            "names": names,
            "has_names": bool(names),
            "count": len(names)
        }

        if extract_divisions:
            # Match whole words, ignoring case. A plain substring test against
            # an upper-cased query could never match mixed-case keywords such
            # as "Finance", and matched "IT" inside words like "WITH".
            # NOTE: the lowercase pronoun "it" still counts as "IT".
            found_divisions = [
                kw for kw in self.DIVISION_KEYWORDS
                if re.search(rf"\b{re.escape(kw)}\b", query, re.IGNORECASE)
            ]

            result["divisions"] = found_divisions
            result["has_divisions"] = bool(found_divisions)

        return result
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces Requirements
2
+ # Optimized for Gradio deployment
3
+
4
+ # Gradio for the web interface
5
+ gradio>=4.0.0
6
+
7
+ # AI/ML Core Libraries
8
+ sentence-transformers>=2.2.0 # For division matching embeddings
9
+ openai-whisper>=20231117 # For speech-to-text
10
+ gliner>=0.1.0 # For name extraction
11
+
12
+ # Scientific Computing
13
+ numpy>=1.24.0
14
+ scipy>=1.10.0 # For audio file handling
15
+
16
+ # Text Processing
17
+ torch>=2.0.0 # Required by models (CPU version for HF Spaces)
18
+
19
+ # Standard Libraries (bundled with Python; nothing to install)
20
+ # Note: logging, os, tempfile, pathlib, typing, difflib, re are standard library
voice_processing_service.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # voice_processing_service.py
2
+ import logging
3
+ import whisper
4
+ import os
5
+ import tempfile
6
+ from typing import Dict, Any, Optional
7
+ from pathlib import Path
8
+
9
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class VoiceProcessor:
    """
    Service for processing voice queries with speech-to-text and translation.

    Features:
    - Speech-to-text using OpenAI Whisper
    - Automatic language detection
    - Translation of non-English speech to English (Whisper "translate" task)

    Whisper Model Sizes:
    - tiny: 39M params, ~1GB RAM, fast but less accurate
    - base: 74M params, ~1GB RAM, balanced (RECOMMENDED for quick start)
    - small: 244M params, ~2GB RAM, good accuracy
    - medium: 769M params, ~5GB RAM, better accuracy
    - large: 1550M params, ~10GB RAM, best accuracy
    """

    # Display names for language codes; codes not listed here are shown
    # upper-cased (see _get_language_name). Add more as needed.
    _LANGUAGE_NAMES = {
        "en": "English",
        "ar": "Arabic",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "zh": "Chinese",
        "ja": "Japanese",
        "ko": "Korean",
        "ru": "Russian",
        "pt": "Portuguese",
        "it": "Italian",
        "nl": "Dutch",
        "tr": "Turkish",
        "pl": "Polish",
        "uk": "Ukrainian",
        "vi": "Vietnamese",
        "th": "Thai",
        "hi": "Hindi",
        "ur": "Urdu",
    }

    # Sub-directory of the system temp dir that receives saved audio files.
    _UPLOAD_DIR_NAME = "voice_assistant_uploads"

    def __init__(self, model_size: str = "base"):
        """
        Initialize the voice processing service.

        Args:
            model_size: Whisper model to use. Options:
                - "tiny" (39M) - Fast, less accurate
                - "base" (74M) - Balanced, recommended for development
                - "small" (244M) - Good accuracy
                - "medium" (769M) - Better accuracy
                - "large" (1550M) - Best accuracy, slowest
        """
        logger.info(f"Loading Whisper model: {model_size}")
        logger.info("This may take a few minutes on first run (downloading model)...")

        # Load Whisper model (downloaded on first run).
        self.model = whisper.load_model(model_size)

        self.model_size = model_size
        logger.info(f"✓ Whisper model '{model_size}' loaded successfully")
        logger.info("Supported languages: 99+ (auto-detected)")

    def transcribe_audio(
        self,
        audio_path: str,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe audio file in its original language.

        Args:
            audio_path: Path to audio file (mp3, wav, m4a, etc.)
            language: Optional language code (e.g., "en", "ar"). If None, auto-detect.

        Returns:
            Dictionary with transcription results:
            {
                "text": "transcribed text",
                "language": "en",
                "language_name": "English",
                "confidence": 0.95
            }
        """
        logger.info(f"Transcribing audio: {audio_path}")

        result = self.model.transcribe(
            audio_path,
            language=language,
            fp16=False  # Use fp32 for better compatibility
        )

        transcription = {
            "text": result["text"].strip(),
            "language": result["language"],
            "language_name": self._get_language_name(result["language"]),
            "confidence": self._calculate_confidence(result)
        }

        logger.info(f"✓ Transcribed: '{transcription['text'][:100]}...'")
        logger.info(f" Language: {transcription['language_name']} ({transcription['language']})")
        logger.info(f" Confidence: {transcription['confidence']:.2f}")

        return transcription

    def translate_to_english(self, audio_path: str) -> Dict[str, Any]:
        """
        Transcribe audio and translate to English (if not already English).

        The audio is first transcribed in its own language, which also
        detects that language. Only when the detected language is not English
        is a second Whisper pass run with ``task="translate"``; for English
        audio the transcription itself is returned as the English text.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with translation results:
            {
                "original_text": "النص الأصلي",
                "english_text": "translated text",
                "original_language": "ar",
                "original_language_name": "Arabic",
                "was_translated": true
            }
        """
        logger.info(f"Processing audio for English output: {audio_path}")

        # First pass: transcribe in the original language (auto-detected).
        original = self.model.transcribe(audio_path, fp16=False)
        original_text = original["text"].strip()
        detected_language = original["language"]
        was_translated = detected_language != "en"

        if was_translated:
            # Second pass: translate. Passing the detected language avoids a
            # repeated detection and keeps both passes in agreement.
            translated = self.model.transcribe(
                audio_path,
                task="translate",
                language=detected_language,
                fp16=False
            )
            english_text = translated["text"].strip()
        else:
            # Already English: a second full inference pass would add nothing.
            english_text = original_text

        result = {
            "original_text": original_text,
            "english_text": english_text,
            "original_language": detected_language,
            "original_language_name": self._get_language_name(detected_language),
            "was_translated": was_translated
        }

        if was_translated:
            logger.info(f"✓ Detected {result['original_language_name']}, translated to English")
            logger.info(f" Original: '{result['original_text'][:100]}...'")
            logger.info(f" English: '{result['english_text'][:100]}...'")
        else:
            logger.info("✓ Already in English, no translation needed")

        return result

    def process_voice_query(self, audio_path: str) -> Dict[str, Any]:
        """
        Complete pipeline: transcribe, translate if needed, return query text.

        This is the main method for the voice assistant use case.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary ready for division extraction:
            {
                "query": "english text for processing",
                "original_text": "original text if different",
                "language": "ar",
                "language_name": "Arabic",
                "was_translated": true,
                "audio_duration": 5.2
            }
        """
        logger.info(f"Processing voice query: {audio_path}")

        # Duration in seconds = decoded sample count / Whisper's sample rate.
        audio_info = whisper.load_audio(audio_path)
        duration = len(audio_info) / whisper.audio.SAMPLE_RATE

        result = self.translate_to_english(audio_path)

        return {
            "query": result["english_text"],  # Always English for processing
            "original_text": result["original_text"],
            "language": result["original_language"],
            "language_name": result["original_language_name"],
            "was_translated": result["was_translated"],
            "audio_duration": round(duration, 2)
        }

    def _get_language_name(self, lang_code: str) -> str:
        """Get full language name from code; unknown codes are upper-cased."""
        return self._LANGUAGE_NAMES.get(lang_code, lang_code.upper())

    def _calculate_confidence(self, whisper_result: Dict) -> float:
        """
        Estimate a 0-1 confidence score from a Whisper result.

        Whisper doesn't directly provide confidence, so the mean of the
        per-segment ``avg_logprob`` values is mapped linearly from [-2, 0]
        to [0, 1] and clamped. Segments lacking ``avg_logprob`` count as -1.0.
        Returns 0.85 when the result has no segments.
        """
        segments = whisper_result.get("segments")
        if not segments:
            # Default confidence when there is nothing to average.
            return 0.85

        mean_logprob = sum(s.get("avg_logprob", -1.0) for s in segments) / len(segments)
        # logprob ranges from -inf to 0; -2 and below map to 0.0, 0 maps to 1.0.
        confidence = max(0.0, min(1.0, (mean_logprob + 2.0) / 2.0))
        return round(confidence, 2)

    def _upload_dir(self) -> Path:
        """Return the temp sub-directory for saved audio, creating it if needed."""
        temp_dir = Path(tempfile.gettempdir()) / self._UPLOAD_DIR_NAME
        temp_dir.mkdir(parents=True, exist_ok=True)
        return temp_dir

    def save_uploaded_audio(self, audio_bytes: bytes, filename: str) -> str:
        """
        Save uploaded audio file to temporary location.

        Only the extension of ``filename`` is reused; the stored file gets a
        random name so uploads cannot collide or choose their own path.

        Args:
            audio_bytes: Audio file bytes
            filename: Original filename

        Returns:
            Path to saved file
        """
        file_extension = Path(filename).suffix
        temp_file = self._upload_dir() / f"upload_{os.urandom(8).hex()}{file_extension}"

        temp_file.write_bytes(audio_bytes)
        logger.info(f"Saved uploaded audio to: {temp_file}")

        return str(temp_file)

    def save_audio_array(self, audio_data, sample_rate: int) -> str:
        """
        Save audio numpy array to temporary WAV file (for Gradio integration).

        Floating-point arrays are treated as normalized samples: they are
        clipped to [-1, 1] and converted to 16-bit PCM. Other arrays are
        written unchanged.

        Args:
            audio_data: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Path to saved WAV file
        """
        import numpy as np
        import scipy.io.wavfile as wavfile

        temp_file = self._upload_dir() / f"gradio_{os.urandom(8).hex()}.wav"

        if isinstance(audio_data, np.ndarray) and np.issubdtype(audio_data.dtype, np.floating):
            # Clip before scaling: without it, samples outside [-1, 1]
            # overflow the int16 range and come out as loud glitches.
            audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

        wavfile.write(str(temp_file), sample_rate, audio_data)
        logger.info(f"Saved Gradio audio to: {temp_file}")

        return str(temp_file)

    def cleanup_temp_file(self, file_path: str):
        """Delete temporary audio file (best effort; never raises)."""
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone - nothing to clean up.
            return
        except (OSError, TypeError, ValueError) as e:
            # Cleanup must not break the request; just record the failure.
            logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
        else:
            logger.info(f"Cleaned up temp file: {file_path}")