Spaces:
Runtime error
Runtime error
| import re | |
| import spacy | |
| from typing import List, Dict, Tuple, Optional, Union | |
| import pickle | |
| from pathlib import Path | |
| import os | |
# --- Define/Import Pipeline Type FIRST ---
# `Pipeline` is referenced in annotations further down this module, so the
# name must be bound before those lines are evaluated.
try:
    from sklearn.pipeline import Pipeline
except ImportError:
    # scikit-learn could not be imported: bind the name to `object` so the
    # annotations that mention `Pipeline` can still be evaluated.
    Pipeline = object  # type: ignore
# --- Import from models.py ---
# The real classifier entry point lives in models.py. When that import fails,
# a stub with the same call signature is installed so the rest of this module
# can still be imported and called.
try:
    from models import predict_category  # Keep this import
except ImportError as import_error:
    print(f"ERROR in utils.py: Could not import predict_category from models.py. Details: {import_error}")

    def predict_category(text, pipeline):
        """Fallback stub: ignore both arguments and report a failed classification."""
        return "Classification failed"
else:
    print("Successfully imported predict_category from models.py")
# --- Model Loading ---
# Directory and file path of the pickled classification pipeline.
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
# Module-level caches: both start as None and are filled in by
# load_spacy_model() / load_model_pipeline() the first time they succeed.
NLP_MODEL: Optional[spacy.language.Language] = None
MODEL_PIPELINE: Optional[Pipeline] = None  # Now Pipeline is defined
def load_spacy_model() -> Optional[spacy.language.Language]:
    """Return the shared spaCy model, loading it on first use.

    The loaded model is cached in the module-level ``NLP_MODEL``. If
    ``en_core_web_sm`` is not installed, one download attempt is made and the
    load is retried.

    Returns:
        The loaded spaCy ``Language`` object, or ``None`` when the model could
        neither be loaded nor downloaded.
    """
    global NLP_MODEL
    if NLP_MODEL is not None:
        return NLP_MODEL

    try:
        NLP_MODEL = spacy.load("en_core_web_sm")
        print("spaCy model 'en_core_web_sm' loaded successfully.")
    except OSError:
        print("Error loading spaCy model 'en_core_web_sm'. Make sure it's downloaded.")
        # Attempt to download if not found (might fail in restricted envs)
        try:
            print("Attempting to download spaCy model...")
            spacy.cli.download("en_core_web_sm")
            NLP_MODEL = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' downloaded and loaded successfully.")
        except (Exception, SystemExit) as download_e:
            # The spaCy download command reports failure (e.g. pip install
            # failing without network access) by calling sys.exit(), which
            # raises SystemExit rather than an Exception subclass. Catch it
            # too so a failed download yields None instead of terminating
            # the whole process.
            print(f"Failed to download or load spaCy model: {download_e}")
            NLP_MODEL = None  # Ensure it remains None if loading fails
    return NLP_MODEL
def load_model_pipeline() -> Optional[Pipeline]:  # Now Pipeline is defined
    """Return the cached classification pipeline, unpickling it on first use.

    The pipeline is read from ``MODEL_PATH`` and stored in the module-level
    ``MODEL_PIPELINE`` so later calls skip the disk read.

    Returns:
        The unpickled pipeline object, or ``None`` if the file is missing or
        cannot be loaded.
    """
    global MODEL_PIPELINE

    # Fast path: a previous call already loaded the pipeline.
    if MODEL_PIPELINE is not None:
        return MODEL_PIPELINE

    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}. Please train and save the model pipeline first.")
        return None

    try:
        # NOTE: pickle.load executes code embedded in the file; the file at
        # MODEL_PATH must come from a trusted source.
        with open(MODEL_PATH, "rb") as model_file:
            MODEL_PIPELINE = pickle.load(model_file)
        print("Model pipeline loaded successfully.")
    except Exception as load_error:
        print(f"Error loading model pipeline from {MODEL_PATH}: {load_error}")
        MODEL_PIPELINE = None  # Ensure it remains None if loading fails
    return MODEL_PIPELINE
# --- PII Detection Regex Patterns ---
# Regex patterns for PII entities not easily caught by NER, keyed by the
# entity-type name that is used as the masking placeholder.
# (Refine these patterns carefully for accuracy)
REGEX_PATTERNS = {
    # The TLD class is [A-Za-z]: inside a character class "|" is a literal
    # pipe, not alternation, so it must not appear there.
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    # Optional country code, then a 3-3-4 digit grouping with optional separators.
    "phone_number": r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
    # 13 to 16 digits, each optionally followed by spaces/hyphens.
    "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',  # Basic pattern, needs refinement
    # Any standalone 3- or 4-digit number.
    "cvv_no": r'\b\d{3,4}\b',  # Often needs context to differentiate
    "expiry_no": r'\b(0[1-9]|1[0-2])\/?([0-9]{4}|[0-9]{2})\b',  # MM/YY or MM/YYYY
    # 12 digits in groups of four, optionally separated by a space or hyphen.
    "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
    # DOB might be harder with regex alone, consider context or NER patterns
    "dob": r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',  # Basic DOB patterns
}
# --- PII Masking Function (Defined within utils.py) ---
def mask_pii(
    text: str,
    nlp: "spacy.language.Language",
    patterns: Optional[Dict[str, str]] = None,
) -> Tuple[str, List[Dict]]:
    """
    Detects and masks PII in the input text using spaCy NER and Regex.

    Multi-word PERSON entities reported by ``nlp`` are always masked as
    ``full_name``. Regex candidates from every pattern are then gathered and
    accepted longest-first, skipping any candidate that overlaps a span that
    was already accepted. Choosing the longest candidate keeps a short,
    generic pattern (e.g. a 3-4 digit CVV) from carving up a longer, more
    specific match (an Aadhaar number, a card number, a date of birth).

    Args:
        text: The input email body string.
        nlp: The spaCy language model; called as ``nlp(text)`` and read
            through ``.ents``.
        patterns: Optional mapping of entity type -> regex pattern string.
            Defaults to the module-level ``REGEX_PATTERNS``.
    Returns:
        A tuple containing:
        - masked_email (str): The email body with PII replaced by
          ``[entity_type]`` placeholders.
        - list_of_masked_entities (List[Dict]): One dict per masked entity,
          sorted by start position, with keys ``position`` ([start, end] in
          the ORIGINAL text), ``classification`` and ``entity``.
    """
    if patterns is None:
        patterns = REGEX_PATTERNS

    # 1. spaCy NER: multi-word PERSON entities become "full_name" spans.
    #    These are accepted up front, so they win over any regex candidate.
    accepted = [
        (ent.start_char, ent.end_char, "full_name", ent.text)
        for ent in nlp(text).ents
        if ent.label_ == "PERSON" and len(ent.text.split()) > 1
    ]

    # 2. Gather every non-empty regex match as a candidate span
    #    (start, end, entity_type, original_value).
    candidates = [
        (match.start(), match.end(), entity_type, match.group(0))
        for entity_type, pattern in patterns.items()
        for match in re.finditer(pattern, text)
        if match.end() > match.start()
    ]

    # 3. Longest candidates first. The sort is stable, so equal-length
    #    candidates keep pattern order and then text order.
    candidates.sort(key=lambda span: span[0] - span[1])
    for candidate in candidates:
        start, end = candidate[0], candidate[1]
        if all(end <= taken[0] or start >= taken[1] for taken in accepted):
            accepted.append(candidate)

    # 4. Rebuild the text left to right, swapping each accepted span for its
    #    placeholder; slicing the original text avoids offset bookkeeping.
    accepted.sort(key=lambda span: span[0])
    pieces = []
    list_of_masked_entities = []
    cursor = 0
    for start, end, entity_type, original_value in accepted:
        pieces.append(text[cursor:start])
        pieces.append(f"[{entity_type}]")
        cursor = end
        list_of_masked_entities.append({
            "position": [start, end],  # Use ORIGINAL indices
            "classification": entity_type,
            "entity": original_value,
        })
    pieces.append(text[cursor:])

    return "".join(pieces), list_of_masked_entities
# --- Main Processing Function (Defined within utils.py) ---
def process_email_request(email_body: str) -> dict:
    """
    Mask PII in an email body and classify the masked text.

    Both models are fetched through their loader functions, which load them
    on first use. If either is unavailable, or any step raises, a dict with
    an "error" message and the original "input_email_body" is returned
    instead of the full response.
    """
    print("Processing email request...")  # Add log
    spacy_nlp = load_spacy_model()
    classifier = load_model_pipeline()

    # Both loaders run before either result is checked; the spaCy model is
    # checked first, so its message wins when both are missing.
    required = (
        (spacy_nlp, "spaCy model not loaded."),
        (classifier, "Classification pipeline not loaded."),
    )
    for resource, message in required:
        if resource is None:
            return {"error": message, "input_email_body": email_body}

    try:
        # Step 1: replace PII with placeholders; entities arrive as dicts
        # with 'position', 'classification' and 'entity' keys.
        masked_body, masked_entities = mask_pii(email_body, spacy_nlp)
        print(f"PII Masking complete. Found {len(masked_entities)} entities.")  # Add log

        # Step 2: the classifier only ever sees the masked text.
        category = predict_category(masked_body, classifier)
        print(f"Classification complete. Predicted class: {category}")  # Add log

        # Step 3: assemble the response payload.
        result = {
            "input_email_body": email_body,
            "list_of_masked_entities": masked_entities,
            "masked_email": masked_body,
            "category_of_the_email": category,
        }
        print("Response constructed successfully.")  # Add log
        return result
    except Exception as exc:
        # Any failure in masking or classification is logged and reported in
        # the response instead of propagating to the caller.
        print(f"Error during email processing: {exc}")
        return {
            "error": f"An error occurred during processing: {exc!s}",
            "input_email_body": email_body,
        }