Spaces:

siddharth786
/

email-pii-classifier-v2

Runtime error

App Files Files Community

email-pii-classifier-v2 / utils.py

siddharth786

Fix circular import and NameError issues

b20f676 11 months ago

raw

history blame contribute delete

8.4 kB

	import re
	import spacy
	from typing import List, Dict, Tuple, Optional, Union
	import pickle
	from pathlib import Path
	import os

	# --- Pipeline type (must exist before the annotations further down) ---
	# scikit-learn is treated as optional at import time: when it is missing,
	# ``Pipeline`` is bound to ``object`` so annotations using it still evaluate.
	try:
	    from sklearn.pipeline import Pipeline
	except ImportError:
	    Pipeline = object  # type: ignore

	# --- Classifier entry point from models.py ---
	# When the import fails, a stand-in with the same call signature is defined
	# so callers of predict_category() keep working; it always returns the
	# fixed string "Classification failed".
	try:
	    from models import predict_category  # Keep this import
	except ImportError as import_error:
	    print(f"ERROR in utils.py: Could not import predict_category from models.py. Details: {import_error}")

	    def predict_category(text, pipeline):
	        return "Classification failed"
	else:
	    print("Successfully imported predict_category from models.py")

	# --- Model locations and lazily filled caches ---
	# The pickle path is relative, so it is resolved against the process's
	# current working directory.
	MODEL_DIR = Path("saved_models")
	MODEL_PATH = MODEL_DIR.joinpath("email_classifier_pipeline.pkl")
	# Both caches start out empty; load_spacy_model() and load_model_pipeline()
	# assign them on first successful load.
	NLP_MODEL: Optional[spacy.language.Language] = None
	MODEL_PIPELINE: Optional[Pipeline] = None  # Pipeline is defined above

	def load_spacy_model() -> Optional[spacy.language.Language]:
	    """Return the shared spaCy model, loading it on first use.

	    The loaded model is cached in the module-level ``NLP_MODEL``. When
	    ``en_core_web_sm`` is not installed, a single download attempt is made
	    and the load is retried.

	    Returns:
	        The loaded model, or ``None`` if it could neither be loaded nor
	        downloaded. Nothing is cached in that case, so a later call will
	        try again.
	    """
	    global NLP_MODEL
	    if NLP_MODEL is not None:
	        return NLP_MODEL

	    try:
	        NLP_MODEL = spacy.load("en_core_web_sm")
	        print("spaCy model 'en_core_web_sm' loaded successfully.")
	    except OSError:
	        print("Error loading spaCy model 'en_core_web_sm'. Make sure it's downloaded.")
	        # Attempt to download if not found (might fail in restricted envs)
	        try:
	            print("Attempting to download spaCy model...")
	            # Import the CLI explicitly instead of relying on ``spacy.cli``
	            # having been set as an attribute by ``import spacy``.
	            from spacy.cli import download

	            download("en_core_web_sm")
	            NLP_MODEL = spacy.load("en_core_web_sm")
	            print("spaCy model 'en_core_web_sm' downloaded and loaded successfully.")
	        except (Exception, SystemExit) as download_e:
	            # SystemExit is caught on purpose: spaCy's CLI helpers can call
	            # sys.exit() when the underlying pip command fails, which would
	            # otherwise terminate the whole application instead of letting
	            # the caller see ``None``.
	            print(f"Failed to download or load spaCy model: {download_e}")
	            NLP_MODEL = None  # Ensure it remains None if loading fails
	    return NLP_MODEL

	def load_model_pipeline() -> Optional[Pipeline]:  # Pipeline is defined above
	    """Return the cached classification pipeline, unpickling it on first use.

	    Returns:
	        The pipeline object read from ``MODEL_PATH``, or ``None`` when the
	        file does not exist or cannot be unpickled. A message is printed in
	        either failure case.
	    """
	    global MODEL_PIPELINE

	    # Fast path: a previous call already loaded the pipeline.
	    if MODEL_PIPELINE is not None:
	        return MODEL_PIPELINE

	    if not MODEL_PATH.exists():
	        print(f"Model pipeline not found at {MODEL_PATH}. Please train and save the model pipeline first.")
	        return None

	    try:
	        # NOTE: pickle executes code embedded in the file; MODEL_PATH must
	        # only ever point at a trusted artifact.
	        with MODEL_PATH.open("rb") as model_file:
	            MODEL_PIPELINE = pickle.load(model_file)
	        print("Model pipeline loaded successfully.")
	    except Exception as load_error:
	        print(f"Error loading model pipeline from {MODEL_PATH}: {load_error}")
	        MODEL_PIPELINE = None  # Ensure it remains None if loading fails

	    return MODEL_PIPELINE

	# --- PII Detection Regex Patterns ---
	# Regex patterns for PII entities not easily caught by NER, keyed by the
	# entity type used in the "[entity_type]" placeholders.
	# (Refine these patterns carefully for accuracy)
	#
	# Alternation is written as a bare "|". An escaped "\|" matches a literal
	# pipe character, which stops the expiry and DOB patterns from matching
	# real dates and lets "|" into the e-mail top-level domain.
	REGEX_PATTERNS = {
	    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
	    "phone_number": r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
	    "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',  # Basic pattern, needs refinement
	    "cvv_no": r'\b\d{3,4}\b',  # Often needs context to differentiate
	    "expiry_no": r'\b(0[1-9]|1[0-2])/?([0-9]{4}|[0-9]{2})\b',  # MM/YY or MM/YYYY
	    "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
	    # DOB might be harder with regex alone, consider context or NER patterns
	    "dob": r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',  # Basic DOB patterns
	}

	# --- PII Masking Function (Defined within utils.py) ---
	def mask_pii(text: str, nlp: spacy.language.Language) -> Tuple[str, List[Dict]]:
	    """
	    Detects and masks PII in the input text using spaCy NER and Regex.

	    Overlap policy: multi-word PERSON entities from NER are always kept.
	    Regex matches are then accepted longest-first (ties: earlier start, then
	    order in REGEX_PATTERNS) and a match is dropped when it overlaps a span
	    that was already accepted. Resolving by length rather than by pattern
	    order keeps a short generic pattern (e.g. a 3-4 digit number) from
	    claiming part of a longer value such as a card number, a 12-digit ID
	    written in groups, or the year of a date.

	    Args:
	        text: The input email body string.
	        nlp: The spaCy language model.

	    Returns:
	        A tuple containing:
	        - masked_email (str): The email body with PII replaced by
	          "[entity_type]" placeholders.
	        - list_of_masked_entities (List[Dict]): One dictionary per masked
	          entity with keys "position" ([start, end] in the ORIGINAL text),
	          "classification" and "entity" (the original value), ordered by
	          start position.
	    """
	    # 1. spaCy NER: only multi-word PERSON entities count as full names.
	    doc = nlp(text)
	    accepted = [
	        (ent.start_char, ent.end_char, "full_name", ent.text)
	        for ent in doc.ents
	        if ent.label_ == "PERSON" and len(ent.text.split()) > 1
	    ]

	    # 2. Collect every regex hit first; overlaps are resolved afterwards.
	    candidates = []
	    for rank, (entity_type, pattern) in enumerate(REGEX_PATTERNS.items()):
	        for match in re.finditer(pattern, text):
	            if match.end() > match.start():  # ignore empty matches
	                candidates.append(
	                    (match.start(), match.end(), rank, entity_type, match.group(0))
	                )

	    # 3. Longest match wins: sort by descending length, then start, then
	    #    pattern order, and keep a candidate only if it touches no kept span.
	    candidates.sort(key=lambda c: (c[0] - c[1], c[0], c[2]))
	    for start, end, _rank, entity_type, value in candidates:
	        if all(end <= kept[0] or start >= kept[1] for kept in accepted):
	            accepted.append((start, end, entity_type, value))

	    # 4. Left-to-right order for both the rebuilt text and the entity list.
	    accepted.sort(key=lambda span: span[0])

	    # 5. Rebuild the text from untouched slices and placeholders. Accepted
	    #    spans never overlap, so a single cursor over the original suffices.
	    pieces = []
	    list_of_masked_entities = []
	    cursor = 0
	    for start, end, entity_type, original_value in accepted:
	        pieces.append(text[cursor:start])
	        pieces.append(f"[{entity_type}]")
	        cursor = end
	        list_of_masked_entities.append({
	            "position": [start, end],  # Use ORIGINAL indices
	            "classification": entity_type,
	            "entity": original_value,
	        })
	    pieces.append(text[cursor:])

	    return "".join(pieces), list_of_masked_entities

	# --- Main Processing Function (Defined within utils.py) ---
	def process_email_request(email_body: str) -> dict:
	    """
	    Masks PII in an email body and classifies the masked text.

	    The spaCy model and the classification pipeline are obtained through
	    load_spacy_model() and load_model_pipeline() on every call.

	    Returns:
	        On success, a dict with the keys "input_email_body",
	        "list_of_masked_entities", "masked_email" and
	        "category_of_the_email". On failure, a dict with "error" and
	        "input_email_body".
	    """

	    def failure(message: str) -> dict:
	        # Shared shape of every error payload returned by this function.
	        return {"error": message, "input_email_body": email_body}

	    print("Processing email request...")  # Add log

	    # Both loaders run before either result is inspected.
	    spacy_model = load_spacy_model()
	    classifier = load_model_pipeline()

	    if spacy_model is None:
	        return failure("spaCy model not loaded.")
	    if classifier is None:
	        return failure("Classification pipeline not loaded.")

	    try:
	        # Step 1: replace PII with placeholders.
	        masked_body, masked_entities = mask_pii(email_body, spacy_model)
	        print(f"PII Masking complete. Found {len(masked_entities)} entities.")  # Add log

	        # Step 2: classify the masked text, not the raw email.
	        category = predict_category(masked_body, classifier)
	        print(f"Classification complete. Predicted class: {category}")  # Add log

	        # Step 3: assemble the response payload.
	        result = {
	            "input_email_body": email_body,
	            "list_of_masked_entities": masked_entities,
	            "masked_email": masked_body,
	            "category_of_the_email": category,
	        }
	        print("Response constructed successfully.")  # Add log
	        return result
	    except Exception as exc:
	        # Any failure in masking or classification is reported to the
	        # caller as an error payload rather than raised.
	        print(f"Error during email processing: {exc}")  # Log the specific error
	        return failure(f"An error occurred during processing: {exc!s}")