# SIH-ML-Backend-Resume-scanner / resume_scanner.py
# Repository page header (kept as comments so the module parses):
#   uploader: Pipalskill ("Upload 9 files"), commit 7a10db2 (verified), size 2.82 kB
import os
import re
from typing import Dict, List, Optional, Tuple
import PyPDF2
import docx2txt
from PIL import Image
import pytesseract
import io
class ResumeScanner:
    """Extract plain text from resume files for use in vector search.

    Supported inputs are PDF, DOC/DOCX, plain text and common image formats
    (images go through OCR). No structured analysis is performed.

    Extraction is best-effort: every failure is reported with ``print`` and
    results in an empty string instead of an exception.
    """

    # Lower-case extensions (without the dot) handled by each extractor.
    _DOCX_EXTENSIONS = frozenset({"doc", "docx"})
    _IMAGE_EXTENSIONS = frozenset({"jpg", "jpeg", "png", "bmp", "tiff"})

    # Everything except word characters, whitespace and ``. , @ -``.
    _SPECIAL_CHARS_RE = re.compile(r"[^\w\s.,@-]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from a file, choosing the extractor by extension.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is inspected
                (case-insensitively).

        Returns:
            The extracted text, or ``""`` if the format is unsupported or
            extraction fails.
        """
        # Only the part after the last dot counts as an extension. A name
        # without any dot (e.g. "pdf") has no extension and is unsupported,
        # rather than being mistaken for a file of that type.
        _, dot, suffix = filename.rpartition(".")
        file_ext = suffix.lower() if dot else ""

        try:
            if file_ext == "pdf":
                return self._extract_from_pdf(file_content)
            if file_ext in self._DOCX_EXTENSIONS:
                # NOTE(review): legacy binary .doc files are routed to the
                # DOCX reader as well; confirm docx2txt can handle them.
                return self._extract_from_docx(file_content)
            if file_ext == "txt":
                return self._decode_text(file_content)
            if file_ext in self._IMAGE_EXTENSIONS:
                return self._extract_from_image(file_content)
            raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            print(f"❌ Error extracting text from {filename}: {e}")
            return ""

    @staticmethod
    def _decode_text(file_content: bytes) -> str:
        """Decode plain-text bytes without failing on non-UTF-8 input.

        UTF-8 is tried first (a leading byte-order mark is dropped). Bytes
        that are not valid UTF-8 are decoded as Latin-1, which maps every
        byte to a character and therefore cannot fail.
        """
        try:
            return file_content.decode("utf-8-sig")
        except UnicodeDecodeError:
            return file_content.decode("latin-1")

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Extract text from PDF bytes, one line break after each page.

        Returns ``""`` if the PDF cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ""`` guards against a page yielding no text object, so a
            # single empty page cannot discard the whole document.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"❌ Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Extract text from DOCX bytes; returns ``""`` on failure."""
        try:
            return docx2txt.process(io.BytesIO(file_content))
        except Exception as e:
            print(f"❌ Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Extract text from image bytes using OCR; returns ``""`` on failure."""
        try:
            # The context manager releases the image's resources as soon as
            # OCR has finished.
            with Image.open(io.BytesIO(file_content)) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"❌ Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Normalize extracted text for vector search.

        Characters other than word characters, whitespace and ``. , @ -``
        are replaced by spaces, then every run of whitespace (including
        newlines) is collapsed to a single space and the result is trimmed.

        Args:
            text: Raw extracted text; may be empty.

        Returns:
            The cleaned single-line text, or ``""`` for empty input.
        """
        if not text:
            return ""
        # Replace special characters first: each becomes a space, so the
        # whitespace collapse must run afterwards or the output would still
        # contain runs of spaces.
        text = self._SPECIAL_CHARS_RE.sub(" ", text)
        return self._WHITESPACE_RE.sub(" ", text).strip()
# Module-level ResumeScanner instance, created once when this module is imported.
resume_scanner = ResumeScanner()