Spaces:

Pipalskill
/

SIH-ML-Backend-Resume-scanner

Sleeping

App Files Files Community

SIH-ML-Backend-Resume-scanner / resume_scanner.py

Pipalskill

Upload 9 files

7a10db2 verified 3 months ago

raw

history blame

2.82 kB

	import os
	import re
	from typing import Dict, List, Optional, Tuple
	import PyPDF2
	import docx2txt
	from PIL import Image
	import pytesseract
	import io

	class ResumeScanner:
	    """Extract plain text from resume files and tidy it for vector search.

	    Supported inputs are PDF, DOC/DOCX, TXT and common image formats (read
	    through OCR).  Extraction is best-effort: every extractor prints a
	    diagnostic and returns an empty string on failure instead of raising.
	    """

	    def __init__(self):
	        pass

	    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
	        """Extract text from a file, choosing the extractor by extension.

	        Args:
	            file_content: Raw bytes of the uploaded file.
	            filename: Original file name; only its extension is inspected
	                (case-insensitively).

	        Returns:
	            The extracted text, or "" when the format is unsupported or the
	            extraction fails.
	        """
	        # rpartition yields an empty separator when the name has no dot, so a
	        # bare name such as "pdf" is no longer mistaken for an extension.
	        _, dot, suffix = filename.lower().rpartition('.')
	        file_ext = suffix if dot else ''

	        try:
	            if file_ext == 'pdf':
	                return self._extract_from_pdf(file_content)
	            if file_ext in ('doc', 'docx'):
	                return self._extract_from_docx(file_content)
	            if file_ext == 'txt':
	                # 'utf-8-sig' drops a leading BOM; errors='replace' keeps the
	                # readable part of files that are not valid UTF-8 instead of
	                # discarding the whole resume.
	                return file_content.decode('utf-8-sig', errors='replace')
	            if file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
	                return self._extract_from_image(file_content)
	            raise ValueError(f"Unsupported file format: {file_ext}")
	        except Exception as e:
	            # Top-level boundary: report the problem and degrade to "".
	            print(f"❌ Error extracting text from {filename}: {e}")
	            return ""

	    def _extract_from_pdf(self, file_content: bytes) -> str:
	        """Extract text from PDF bytes, one newline-terminated chunk per page.

	        Returns "" (after printing a diagnostic) if the PDF cannot be read.
	        """
	        try:
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
	            # `or ""` guards against a page yielding no text object, which
	            # would otherwise raise TypeError and lose every page.
	            return "".join(
	                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
	            )
	        except Exception as e:
	            print(f"❌ Error reading PDF: {e}")
	            return ""

	    def _extract_from_docx(self, file_content: bytes) -> str:
	        """Extract text from DOCX bytes using docx2txt.

	        Returns "" (after printing a diagnostic) if the document cannot be read.
	        """
	        # NOTE(review): '.doc' files are routed here as well; docx2txt
	        # presumably only understands the zip-based .docx format, so legacy
	        # binary .doc input likely ends up as "" - confirm.
	        try:
	            return docx2txt.process(io.BytesIO(file_content))
	        except Exception as e:
	            print(f"❌ Error reading DOCX: {e}")
	            return ""

	    def _extract_from_image(self, file_content: bytes) -> str:
	        """Extract text from image bytes using OCR (pytesseract).

	        Returns "" (after printing a diagnostic) if the image cannot be
	        opened or OCR fails.
	        """
	        try:
	            # The context manager releases the image once OCR is done.
	            with Image.open(io.BytesIO(file_content)) as image:
	                return pytesseract.image_to_string(image)
	        except Exception as e:
	            print(f"❌ Error reading image with OCR: {e}")
	            return ""

	    def clean_extracted_text(self, text: str) -> str:
	        """Normalize extracted text for vector search.

	        Characters other than word characters, whitespace and ``. , @ -`` are
	        replaced by spaces, then every whitespace run (including newlines) is
	        collapsed to a single space and the result is stripped.

	        Args:
	            text: Raw extracted text; may be empty or None.

	        Returns:
	            The cleaned single-line text, or "" for empty input.
	        """
	        if not text:
	            return ""

	        # Replace special characters first: each replacement inserts a space,
	        # so the whitespace collapse has to run afterwards to merge them.
	        text = re.sub(r'[^\w\s.,@-]', ' ', text)
	        text = re.sub(r'\s+', ' ', text)

	        return text.strip()

	# Module-level ResumeScanner instance, created once when this module is
	# imported.
	resume_scanner = ResumeScanner()