"""Plain-text extraction from uploaded resume files (PDF, Word, text, images)."""

import io
import os
import re
from typing import Dict, List, Optional, Tuple

# Each third-party backend serves exactly one file family. Guarding the
# imports means a missing package only disables that family (reported when
# the extractor is called) instead of making the whole module unimportable.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx2txt
except ImportError:
    docx2txt = None

try:
    from PIL import Image
except ImportError:
    Image = None

try:
    import pytesseract
except ImportError:
    pytesseract = None


# Compiled once: clean_extracted_text() runs for every extracted document.
# Anything that is not a word character, whitespace or one of . , @ - is
# treated as noise.
_DISALLOWED_CHARS = re.compile(r'[^\w\s.,@-]')
_WHITESPACE_RUN = re.compile(r'\s+')


def _require(module, package: str):
    """Return *module*, or raise RuntimeError if the optional import failed."""
    if module is None:
        raise RuntimeError(f"{package} is not installed")
    return module


class ResumeScanner:
    """Simple resume text extractor - no complex analysis needed for vector search.

    The class holds no state. Every extractor is best-effort: on failure it
    prints a diagnostic and returns an empty string instead of raising.
    """

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from various file formats.

        The format is chosen from the text after the last ``.`` in *filename*
        (case-insensitive): ``pdf``; ``doc``/``docx``; ``txt``; or one of the
        image types ``jpg``, ``jpeg``, ``png``, ``bmp``, ``tiff``.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is inspected.

        Returns:
            The extracted text, or ``""`` when the format is unsupported or
            extraction fails (a message is printed in both cases).
        """
        file_ext = filename.lower().rsplit('.', 1)[-1]

        try:
            if file_ext == 'pdf':
                return self._extract_from_pdf(file_content)
            if file_ext in ('doc', 'docx'):
                return self._extract_from_docx(file_content)
            if file_ext == 'txt':
                return self._decode_text(file_content)
            if file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
                return self._extract_from_image(file_content)
            raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            print(f"❌ Error extracting text from {filename}: {e}")
            return ""

    @staticmethod
    def _decode_text(file_content: bytes) -> str:
        """Decode plain-text bytes as UTF-8 without discarding the document.

        ``utf-8-sig`` strips a leading byte-order mark if present. Bytes that
        are not valid UTF-8 become U+FFFD so one stray byte does not cost the
        entire resume.
        """
        try:
            return file_content.decode('utf-8-sig')
        except UnicodeDecodeError:
            return file_content.decode('utf-8-sig', errors='replace')

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Extract text from PDF file.

        Returns the text of every page, each followed by a newline, or ``""``
        if the PDF cannot be read.
        """
        try:
            pdf_reader = _require(PyPDF2, "PyPDF2").PdfReader(io.BytesIO(file_content))
            # `or ""` guards against a page whose extraction yields None, which
            # would otherwise raise TypeError and discard the other pages too.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"❌ Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Extract text from DOCX file.

        NOTE(review): ``.doc`` uploads are routed here as well; docx2txt
        presumably cannot parse the legacy binary format, in which case this
        returns ``""`` - confirm whether .doc should be handled separately.
        """
        try:
            return _require(docx2txt, "docx2txt").process(io.BytesIO(file_content))
        except Exception as e:
            print(f"❌ Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Extract text from image using OCR.

        Returns whatever pytesseract recognises, or ``""`` on failure.
        """
        try:
            pil_image = _require(Image, "Pillow")
            ocr = _require(pytesseract, "pytesseract")
            # Context manager releases the decoder's resources once OCR is done.
            with pil_image.open(io.BytesIO(file_content)) as image:
                return ocr.image_to_string(image)
        except Exception as e:
            print(f"❌ Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Clean and optimize extracted text for better vector search.

        Characters other than word characters, whitespace and ``. , @ -`` are
        replaced by spaces, then every run of whitespace (including newlines)
        is collapsed to a single space and the result is trimmed.

        Args:
            text: Raw extracted text; ``None`` or ``""`` yields ``""``.

        Returns:
            The normalised single-line text.
        """
        if not text:
            return ""

        # Replace special characters that might interfere with search.
        # NOTE(review): this also strips '+' and '#', so "C++" and "C#" both
        # become "C" - confirm that is acceptable for skill matching.
        text = _DISALLOWED_CHARS.sub(' ', text)

        # Collapse whitespace *after* the substitution above, because that
        # step introduces new spaces of its own.
        text = _WHITESPACE_RUN.sub(' ', text)

        return text.strip()


# Global instance
resume_scanner = ResumeScanner()