"""Gradio demo: credit-card fraud detection with a Random Forest classifier.

Workflow: upload a labelled CSV -> train a model -> score single transactions
or whole CSV batches. Built for the AI Infinity Programme (TalentSprint).
"""

import gradio as gr
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle  # NOTE(review): currently unused — presumably reserved for model persistence
import os
import tempfile

# Global variables to store the model and data (shared across Gradio callbacks)
model = None             # trained RandomForestClassifier, or None before training
feature_columns = None   # exact column order the model was fitted with


def load_and_train_model(csv_file):
    """Train a Random Forest on the uploaded CSV and return a markdown report.

    The CSV must contain a binary ``fraud`` target column (0 = legitimate,
    1 = fraud); an optional ``transaction_id`` column is ignored.

    Side effects: sets the module-level ``model`` and ``feature_columns``.

    Returns a markdown string (training summary or error message).
    """
    global model, feature_columns

    try:
        df = pd.read_csv(csv_file.name)

        if 'fraud' not in df.columns:
            return "❌ Error: CSV must contain a 'fraud' column as the target variable."

        # Separate features and target; errors='ignore' tolerates a missing id column.
        X = df.drop(['fraud', 'transaction_id'], axis=1, errors='ignore')
        y = df['fraud']
        feature_columns = X.columns.tolist()

        # Stratified split keeps the (usually tiny) fraud ratio identical in
        # both partitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
        model.fit(X_train, y_train)

        # Evaluate on the held-out split.
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 avoids warnings/exceptions when the model predicts
        # no positives at all (degenerate but possible on tiny uploads).
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        # Pinning labels guarantees a 2x2 matrix even if one class is absent
        # from the test split, so the cm[i][j] indexing below cannot IndexError.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        results = f"""
✅ **Model Trained Successfully!**

📊 **Dataset Information:**
- Total Samples: {len(df)}
- Training Samples: {len(X_train)}
- Test Samples: {len(X_test)}
- Fraud Cases: {y.sum()} ({y.mean()*100:.1f}%)
- Legitimate Cases: {(y==0).sum()} ({(y==0).mean()*100:.1f}%)

📈 **Model Performance:**
- **Accuracy:** {accuracy*100:.2f}%
- **Precision:** {precision*100:.2f}%
- **Recall:** {recall*100:.2f}%
- **F1-Score:** {f1*100:.2f}%

🔢 **Confusion Matrix:**
```
              Predicted
              Fraud  Legitimate
Actual Fraud    {cm[1][1]}      {cm[1][0]}
       Legit    {cm[0][1]}      {cm[0][0]}
```

**Key Metrics Explained:**
- **True Positives (TP):** {cm[1][1]} frauds correctly detected
- **False Negatives (FN):** {cm[1][0]} frauds missed (⚠️ costly!)
- **False Positives (FP):** {cm[0][1]} false alarms
- **True Negatives (TN):** {cm[0][0]} legitimate transactions correctly identified

✅ Model is ready! You can now make predictions below.
"""
        return results

    except Exception as e:
        return f"❌ Error: {str(e)}"


def predict_single_transaction(amount, hour, dist_home, dist_last, ratio_median,
                               repeat_retailer, used_chip, used_pin, online_order):
    """Score one transaction with the trained model.

    Returns ``(details_markdown, short_label)``; both convey an error message
    if no model has been trained yet or prediction fails.
    """
    global model, feature_columns

    if model is None:
        return "⚠️ Please upload and train a model first!", ""

    try:
        input_data = pd.DataFrame({
            'transaction_amount': [amount],
            'transaction_hour': [hour],
            'distance_from_home_km': [dist_home],
            'distance_from_last_transaction_km': [dist_last],
            'ratio_to_median_purchase': [ratio_median],
            'repeat_retailer': [repeat_retailer],
            'used_chip': [used_chip],
            'used_pin': [used_pin],
            'online_order': [online_order]
        })

        # Align column order with the schema the model was fitted on;
        # a mismatch raises a KeyError that surfaces via the except below.
        if feature_columns is not None:
            input_data = input_data[feature_columns]

        prediction = model.predict(input_data)[0]
        probability = model.predict_proba(input_data)[0]

        fraud_prob = probability[1] * 100
        legit_prob = probability[0] * 100

        if prediction == 1:
            result = f"🚨 **FRAUD DETECTED**"
            confidence = fraud_prob
        else:
            result = f"✅ **LEGITIMATE TRANSACTION**"
            confidence = legit_prob

        details = f"""
{result}

**Confidence:** {confidence:.1f}%

**Probability Distribution:**
- Fraud: {fraud_prob:.1f}%
- Legitimate: {legit_prob:.1f}%

**Risk Level:** {'🔴 HIGH' if fraud_prob > 70 else '🟡 MEDIUM' if fraud_prob > 40 else '🟢 LOW'}

**Transaction Details:**
- Amount: ${amount:,.2f}
- Time: {hour}:00
- Distance from home: {dist_home:.1f} km
- Distance from last transaction: {dist_last:.1f} km
- Ratio to median: {ratio_median:.2f}x
- Repeat retailer: {'Yes' if repeat_retailer else 'No'}
- Used chip: {'Yes' if used_chip else 'No'}
- Used PIN: {'Yes' if used_pin else 'No'}
- Online order: {'Yes' if online_order else 'No'}
"""
        return details, result

    except Exception as e:
        return f"❌ Error: {str(e)}", ""


def predict_batch(csv_file):
    """Score every row of an uploaded CSV with the trained model.

    Returns ``(output_csv_path, metrics_markdown)``; the path is ``None``
    when no model is trained or an error occurs. If the upload includes a
    ``fraud`` column, evaluation metrics are reported as well.
    """
    global model, feature_columns

    if model is None:
        return None, "⚠️ Please upload and train a model first!"

    try:
        df = pd.read_csv(csv_file.name)

        # Keep an untouched copy so the download preserves all original columns.
        original_df = df.copy()

        # Select exactly the training features, in training order — extra
        # columns are ignored and a missing one raises a descriptive KeyError.
        X = df[feature_columns]

        predictions = model.predict(X)
        probabilities = model.predict_proba(X)

        original_df['predicted_fraud'] = predictions
        original_df['fraud_probability'] = probabilities[:, 1] * 100
        original_df['confidence'] = np.max(probabilities, axis=1) * 100

        # Report evaluation metrics only when ground-truth labels are present.
        if 'fraud' in original_df.columns:
            accuracy = accuracy_score(original_df['fraud'], predictions)
            precision = precision_score(original_df['fraud'], predictions, zero_division=0)
            recall = recall_score(original_df['fraud'], predictions, zero_division=0)
            f1 = f1_score(original_df['fraud'], predictions, zero_division=0)

            metrics = f"""
📊 **Batch Prediction Results:**
- Total Transactions: {len(df)}
- Predicted Fraud: {predictions.sum()} ({predictions.mean()*100:.1f}%)
- Predicted Legitimate: {(predictions==0).sum()} ({(predictions==0).mean()*100:.1f}%)

📈 **Performance Metrics:**
- Accuracy: {accuracy*100:.2f}%
- Precision: {precision*100:.2f}%
- Recall: {recall*100:.2f}%
- F1-Score: {f1*100:.2f}%

✅ Results are ready for download!
"""
        else:
            metrics = f"""
📊 **Batch Prediction Results:**
- Total Transactions: {len(df)}
- Predicted Fraud: {predictions.sum()} ({predictions.mean()*100:.1f}%)
- Predicted Legitimate: {(predictions==0).sum()} ({(predictions==0).mean()*100:.1f}%)

✅ Results are ready for download!
"""

        # Write into the temp dir: the app's CWD may be read-only in deployment.
        output_file = os.path.join(tempfile.gettempdir(), "predictions_output.csv")
        original_df.to_csv(output_file, index=False)

        return output_file, metrics

    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="Fraud Detection System") as demo:
    gr.Markdown("""
    # 💳 Credit Card Fraud Detection System
    ### AI Infinity Programme | TalentSprint

    This interactive demo allows you to train a fraud detection model and make predictions on credit card transactions.

    **How to use:**
    1. Upload your training dataset (CSV file)
    2. Train the model
    3. Make single predictions or batch predictions
    """)

    with gr.Tab("📤 Upload & Train Model"):
        gr.Markdown("### Step 1: Upload Training Dataset")
        gr.Markdown("Upload a CSV file containing transaction data with a 'fraud' column (0 = legitimate, 1 = fraud)")

        with gr.Row():
            with gr.Column():
                train_file = gr.File(label="Upload Training CSV", file_types=[".csv"])
                train_button = gr.Button("🚀 Train Model", variant="primary", size="lg")
            with gr.Column():
                train_output = gr.Markdown(label="Training Results")

        train_button.click(
            fn=load_and_train_model,
            inputs=[train_file],
            outputs=[train_output]
        )

        gr.Markdown("""
        ---
        **Expected CSV format:**
        - `transaction_amount`, `transaction_hour`, `distance_from_home_km`, `distance_from_last_transaction_km`,
        - `ratio_to_median_purchase`, `repeat_retailer`, `used_chip`, `used_pin`, `online_order`, `fraud`
        """)

    with gr.Tab("🔍 Single Prediction"):
        gr.Markdown("### Test Individual Transactions")
        gr.Markdown("Enter transaction details to check if it's fraudulent")

        with gr.Row():
            with gr.Column():
                amount = gr.Number(label="Transaction Amount ($)", value=100)
                hour = gr.Slider(0, 23, step=1, label="Transaction Hour (0-23)", value=14)
                dist_home = gr.Number(label="Distance from Home (km)", value=10)
                dist_last = gr.Number(label="Distance from Last Transaction (km)", value=5)
                ratio_median = gr.Number(label="Ratio to Median Purchase", value=1.0)
            with gr.Column():
                repeat_retailer = gr.Checkbox(label="Repeat Retailer", value=True)
                used_chip = gr.Checkbox(label="Used Chip", value=True)
                used_pin = gr.Checkbox(label="Used PIN", value=True)
                online_order = gr.Checkbox(label="Online Order", value=False)

        predict_button = gr.Button("🔮 Predict", variant="primary", size="lg")

        with gr.Row():
            prediction_output = gr.Markdown(label="Prediction Result")
            prediction_label = gr.Markdown(label="Quick Result")

        predict_button.click(
            fn=predict_single_transaction,
            inputs=[amount, hour, dist_home, dist_last, ratio_median,
                    repeat_retailer, used_chip, used_pin, online_order],
            outputs=[prediction_output, prediction_label]
        )

        gr.Markdown("---")
        gr.Markdown("### 🧪 Quick Test Scenarios")
        with gr.Row():
            gr.Markdown("""
            **Scenario 1: Obvious Fraud**
            - Amount: $4500, Hour: 3, Dist Home: 800km
            - New retailer, no chip/PIN, online
            """)
            gr.Markdown("""
            **Scenario 2: Normal Transaction**
            - Amount: $45, Hour: 14, Dist Home: 5km
            - Repeat retailer, chip + PIN, in-person
            """)
            gr.Markdown("""
            **Scenario 3: Suspicious**
            - Amount: $350, Hour: 22, Dist Home: 60km
            - New retailer, chip but no PIN, online
            """)

    with gr.Tab("📊 Batch Predictions"):
        gr.Markdown("### Upload Multiple Transactions")
        gr.Markdown("Upload a CSV file with multiple transactions to get predictions for all of them")

        with gr.Row():
            with gr.Column():
                batch_file = gr.File(label="Upload Test CSV", file_types=[".csv"])
                batch_button = gr.Button("📈 Predict Batch", variant="primary", size="lg")
            with gr.Column():
                batch_output = gr.Markdown(label="Batch Results")
                download_file = gr.File(label="Download Results CSV")

        batch_button.click(
            fn=predict_batch,
            inputs=[batch_file],
            outputs=[download_file, batch_output]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## About This Demo

        This fraud detection system uses a **Random Forest Classifier** to identify potentially fraudulent credit card transactions.

        ### Features Used:
        1. **transaction_amount**: Transaction value in dollars
        2. **transaction_hour**: Hour of day (0-23)
        3. **distance_from_home_km**: Distance from cardholder's home
        4. **distance_from_last_transaction_km**: Distance from previous transaction
        5. **ratio_to_median_purchase**: Ratio compared to typical spending
        6. **repeat_retailer**: Whether customer used this merchant before
        7. **used_chip**: Whether chip card was used
        8. **used_pin**: Whether PIN was entered
        9. **online_order**: Whether transaction was online

        ### Model Performance:
        The model is trained to maximize **recall** (catching frauds) while maintaining reasonable **precision** (avoiding false alarms).

        ### Important Metrics:
        - **Precision**: Of flagged transactions, how many are actually fraud?
        - **Recall**: Of all frauds, how many do we catch?
        - **F1-Score**: Balance between precision and recall

        ### Business Impact:
        - **False Negative (missed fraud)**: Very costly - customer loses money
        - **False Positive (false alarm)**: Moderately costly - customer inconvenience

        ---
        **Created for:** AI Infinity Programme | TalentSprint
        **Target Audience:** Software engineers transitioning to AI roles
        **Educational Purpose:** Understanding classification, metrics, and business logic
        """)

# Launch the app
if __name__ == "__main__":
    demo.launch()