# Captured from a Hugging Face Spaces file view; the Space status at capture time was "Build error".
# File size: 13,641 bytes
import os
import pickle
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
# Shared state for the Gradio callbacks below: the trained classifier and the
# list of feature column names it was trained on. Both are None until
# load_and_train_model() assigns them.
model = feature_columns = None
def _format_training_report(n_total, n_train, n_test, y, scores, cm):
    """Render dataset statistics and hold-out metrics as a Markdown string."""
    # cm is laid out as [[TN, FP], [FN, TP]] for labels [0, 1].
    tn, fp, fn, tp = (int(v) for v in cm.ravel())
    lines = [
        "✅ **Model Trained Successfully!**",
        "",
        "📊 **Dataset Information:**",
        f"- Total Samples: {n_total}",
        f"- Training Samples: {n_train}",
        f"- Test Samples: {n_test}",
        f"- Fraud Cases: {y.sum()} ({y.mean()*100:.1f}%)",
        f"- Legitimate Cases: {(y==0).sum()} ({(y==0).mean()*100:.1f}%)",
        "",
        "📈 **Model Performance:**",
        f"- **Accuracy:** {scores['accuracy']*100:.2f}%",
        f"- **Precision:** {scores['precision']*100:.2f}%",
        f"- **Recall:** {scores['recall']*100:.2f}%",
        f"- **F1-Score:** {scores['f1']*100:.2f}%",
        "",
        "🔢 **Confusion Matrix:**",
        "```",
        "                 Predicted",
        "                 Fraud      Legitimate",
        f"Actual Fraud     {tp:<10} {fn}",
        f"       Legit     {fp:<10} {tn}",
        "```",
        "",
        "**Key Metrics Explained:**",
        f"- **True Positives (TP):** {tp} frauds correctly detected",
        f"- **False Negatives (FN):** {fn} frauds missed (⚠️ costly!)",
        f"- **False Positives (FP):** {fp} false alarms",
        f"- **True Negatives (TN):** {tn} legitimate transactions correctly identified",
        "",
        "✅ Model is ready! You can now make predictions below.",
    ]
    return "\n".join(lines)


def load_and_train_model(csv_file):
    """Train the fraud classifier on an uploaded CSV and return a Markdown report.

    The CSV must contain a ``fraud`` target column. Every other column except
    ``transaction_id`` is used as a feature. 20% of the rows are held out
    (stratified on the target) to compute the reported metrics.

    Args:
        csv_file: The uploaded file, either a path (``str`` / ``os.PathLike``)
            or an object exposing the path as ``.name``. ``None`` when nothing
            was uploaded.

    Returns:
        A Markdown string: dataset statistics and hold-out metrics on success,
        or a single ``❌ Error: ...`` line on failure. Exceptions are converted
        to an error string rather than raised.

    Side effects:
        On success only, rebinds the module globals ``model`` (the fitted
        classifier) and ``feature_columns`` (feature names in training order).
        A failed run leaves any previously trained model untouched.
    """
    global model, feature_columns

    if csv_file is None:
        return "❌ Error: Please upload a training CSV file first."

    try:
        # Accept both a plain path and a file-like upload object.
        if isinstance(csv_file, (str, os.PathLike)):
            csv_path = csv_file
        else:
            csv_path = csv_file.name
        df = pd.read_csv(csv_path)

        if 'fraud' not in df.columns:
            return "❌ Error: CSV must contain a 'fraud' column as the target variable."

        # Separate features and target; the id column carries no signal.
        X = df.drop(columns=['fraud', 'transaction_id'], errors='ignore')
        y = df['fraud']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Train into a local so a failure cannot leave an unfitted estimator
        # in the global ``model`` slot.
        classifier = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        scores = {
            'accuracy': accuracy_score(y_test, y_pred),
            # zero_division=0: report 0 instead of warning when no fraud is predicted.
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
        }
        # Fixing the labels guarantees a 2x2 matrix even if a class is absent.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        report = _format_training_report(len(df), len(X_train), len(X_test), y, scores, cm)

        # Publish the trained state only after everything above succeeded.
        model = classifier
        feature_columns = X.columns.tolist()
        return report
    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing the callback.
        return f"❌ Error: {str(e)}"
def predict_single_transaction(amount, hour, dist_home, dist_last, ratio_median,
                               repeat_retailer, used_chip, used_pin, online_order):
    """Classify one transaction with the trained model.

    Args:
        amount: Transaction amount in dollars.
        hour: Hour of day of the transaction.
        dist_home: Distance from home, in km.
        dist_last: Distance from the last transaction, in km.
        ratio_median: Ratio of the amount to the median purchase.
        repeat_retailer: Truthy if the retailer was used before.
        used_chip: Truthy if the chip was used.
        used_pin: Truthy if a PIN was used.
        online_order: Truthy if the transaction was an online order.

    Returns:
        A ``(details, headline)`` tuple of Markdown strings. ``details`` is
        the full breakdown and ``headline`` is the one-line verdict. If no
        model is trained, or on any error, ``details`` holds the message and
        ``headline`` is ``""``.
    """
    if model is None:
        return "⚠️ Please upload and train a model first!", ""

    try:
        input_data = pd.DataFrame({
            'transaction_amount': [amount],
            'transaction_hour': [hour],
            'distance_from_home_km': [dist_home],
            'distance_from_last_transaction_km': [dist_last],
            'ratio_to_median_purchase': [ratio_median],
            'repeat_retailer': [repeat_retailer],
            'used_chip': [used_chip],
            'used_pin': [used_pin],
            'online_order': [online_order],
        })

        # Present the columns in the order used at training time; the form's
        # order is not guaranteed to match the training CSV.
        if feature_columns is not None:
            missing = [c for c in feature_columns if c not in input_data.columns]
            if missing:
                names = ', '.join(str(c) for c in missing)
                return f"❌ Error: The trained model expects column(s) this form does not provide: {names}", ""
            input_data = input_data[feature_columns]

        prediction = model.predict(input_data)[0]
        probability = model.predict_proba(input_data)[0]

        # Locate the fraud class explicitly; a model fitted on a single class
        # exposes only one probability column.
        classes = list(model.classes_)
        fraud_prob = probability[classes.index(1)] * 100 if 1 in classes else 0.0
        legit_prob = 100.0 - fraud_prob

        if prediction == 1:
            result = "🚨 **FRAUD DETECTED**"
            confidence = fraud_prob
        else:
            result = "✅ **LEGITIMATE TRANSACTION**"
            confidence = legit_prob

        if fraud_prob > 70:
            risk_level = '🔴 HIGH'
        elif fraud_prob > 40:
            risk_level = '🟡 MEDIUM'
        else:
            risk_level = '🟢 LOW'

        def yes_no(flag):
            return 'Yes' if flag else 'No'

        details = "\n".join([
            result,
            "",
            f"**Confidence:** {confidence:.1f}%",
            "",
            "**Probability Distribution:**",
            f"- Fraud: {fraud_prob:.1f}%",
            f"- Legitimate: {legit_prob:.1f}%",
            "",
            f"**Risk Level:** {risk_level}",
            "",
            "**Transaction Details:**",
            f"- Amount: ${amount:,.2f}",
            # int() keeps a float hour such as 14.0 from rendering as "14.0:00".
            f"- Time: {int(hour):02d}:00",
            f"- Distance from home: {dist_home:.1f} km",
            f"- Distance from last transaction: {dist_last:.1f} km",
            f"- Ratio to median: {ratio_median:.2f}x",
            f"- Repeat retailer: {yes_no(repeat_retailer)}",
            f"- Used chip: {yes_no(used_chip)}",
            f"- Used PIN: {yes_no(used_pin)}",
            f"- Online order: {yes_no(online_order)}",
        ])
        return details, result
    except Exception as e:
        # UI boundary: report the failure in the output panel.
        return f"❌ Error: {str(e)}", ""
def predict_batch(csv_file):
    """Score every transaction in an uploaded CSV and write the results to a new CSV.

    The ``fraud`` and ``transaction_id`` columns, if present, are not used as
    features. When a ``fraud`` column is present it is treated as ground truth
    and accuracy, precision, recall and F1 are added to the summary.

    Args:
        csv_file: The uploaded file, either a path (``str`` / ``os.PathLike``)
            or an object exposing the path as ``.name``. ``None`` when nothing
            was uploaded.

    Returns:
        A ``(output_path, summary)`` tuple. ``output_path`` is the path of a
        CSV holding the input rows plus ``predicted_fraud``,
        ``fraud_probability`` and ``confidence`` columns (both in percent), or
        ``None`` on failure. ``summary`` is a Markdown string with the result
        overview or the error message.
    """
    if model is None:
        return None, "⚠️ Please upload and train a model first!"
    if csv_file is None:
        return None, "❌ Error: Please upload a CSV file with transactions to score."

    try:
        # Accept both a plain path and a file-like upload object.
        if isinstance(csv_file, (str, os.PathLike)):
            csv_path = csv_file
        else:
            csv_path = csv_file.name
        df = pd.read_csv(csv_path)
        if df.empty:
            return None, "❌ Error: The uploaded CSV contains no transactions."

        X = df.drop(columns=['fraud', 'transaction_id'], errors='ignore')

        # Match the training-time columns and their order.
        if feature_columns is not None:
            missing = [c for c in feature_columns if c not in X.columns]
            if missing:
                names = ', '.join(str(c) for c in missing)
                return None, f"❌ Error: CSV is missing required column(s): {names}"
            X = X[feature_columns]

        predictions = np.asarray(model.predict(X))
        probabilities = np.asarray(model.predict_proba(X))

        # Locate the fraud class explicitly; a model fitted on a single class
        # exposes only one probability column.
        classes = list(model.classes_)
        if 1 in classes:
            fraud_probability = probabilities[:, classes.index(1)] * 100
        else:
            fraud_probability = np.zeros(len(df))

        results = df.copy()
        results['predicted_fraud'] = predictions
        results['fraud_probability'] = fraud_probability
        results['confidence'] = probabilities.max(axis=1) * 100

        summary = [
            "📊 **Batch Prediction Results:**",
            f"- Total Transactions: {len(df)}",
            f"- Predicted Fraud: {predictions.sum()} ({predictions.mean()*100:.1f}%)",
            f"- Predicted Legitimate: {(predictions==0).sum()} ({(predictions==0).mean()*100:.1f}%)",
        ]

        # Ground truth available: report how well the predictions match it.
        if 'fraud' in df.columns:
            actual = df['fraud']
            accuracy = accuracy_score(actual, predictions)
            # zero_division=0: report 0 instead of warning when a class is never predicted.
            precision = precision_score(actual, predictions, zero_division=0)
            recall = recall_score(actual, predictions, zero_division=0)
            f1 = f1_score(actual, predictions, zero_division=0)
            summary += [
                "",
                "📈 **Performance Metrics:**",
                f"- Accuracy: {accuracy*100:.2f}%",
                f"- Precision: {precision*100:.2f}%",
                f"- Recall: {recall*100:.2f}%",
                f"- F1-Score: {f1*100:.2f}%",
            ]

        summary += ["", "✅ Results are ready for download!"]

        # A fresh directory per call keeps the download name stable while
        # preventing concurrent requests from overwriting each other's file.
        output_dir = tempfile.mkdtemp(prefix="fraud_predictions_")
        output_file = os.path.join(output_dir, "predictions_output.csv")
        results.to_csv(output_file, index=False)

        return output_file, "\n".join(summary)
    except Exception as e:
        # UI boundary: report the failure in the output panel.
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------

# Static Markdown is kept in flush-left module constants so the rendered text
# never picks up code indentation, and so sections stay separated by the blank
# lines Markdown needs between headings, lists and rules.
_INTRO_MD = """
# 💳 Credit Card Fraud Detection System
### AI Infinity Programme | TalentSprint

This interactive demo allows you to train a fraud detection model and make predictions on credit card transactions.

**How to use:**
1. Upload your training dataset (CSV file)
2. Train the model
3. Make single predictions or batch predictions
"""

_CSV_FORMAT_MD = """
---

**Expected CSV format:**
- `transaction_amount`, `transaction_hour`, `distance_from_home_km`, `distance_from_last_transaction_km`,
- `ratio_to_median_purchase`, `repeat_retailer`, `used_chip`, `used_pin`, `online_order`, `fraud`
"""

_SCENARIOS_MD = (
    """
**Scenario 1: Obvious Fraud**
- Amount: $4500, Hour: 3, Dist Home: 800km
- New retailer, no chip/PIN, online
""",
    """
**Scenario 2: Normal Transaction**
- Amount: $45, Hour: 14, Dist Home: 5km
- Repeat retailer, chip + PIN, in-person
""",
    """
**Scenario 3: Suspicious**
- Amount: $350, Hour: 22, Dist Home: 60km
- New retailer, chip but no PIN, online
""",
)

_ABOUT_MD = """
## About This Demo

This fraud detection system uses a **Random Forest Classifier** to identify potentially fraudulent credit card transactions.

### Features Used:
1. **transaction_amount**: Transaction value in dollars
2. **transaction_hour**: Hour of day (0-23)
3. **distance_from_home_km**: Distance from cardholder's home
4. **distance_from_last_transaction_km**: Distance from previous transaction
5. **ratio_to_median_purchase**: Ratio compared to typical spending
6. **repeat_retailer**: Whether customer used this merchant before
7. **used_chip**: Whether chip card was used
8. **used_pin**: Whether PIN was entered
9. **online_order**: Whether transaction was online

### Model Performance:
The model is trained to maximize **recall** (catching frauds) while maintaining reasonable **precision** (avoiding false alarms).

### Important Metrics:
- **Precision**: Of flagged transactions, how many are actually fraud?
- **Recall**: Of all frauds, how many do we catch?
- **F1-Score**: Balance between precision and recall

### Business Impact:
- **False Negative (missed fraud)**: Very costly - customer loses money
- **False Positive (false alarm)**: Moderately costly - customer inconvenience

---

**Created for:** AI Infinity Programme | TalentSprint

**Target Audience:** Software engineers transitioning to AI roles

**Educational Purpose:** Understanding classification, metrics, and business logic
"""

with gr.Blocks(title="Fraud Detection System") as demo:
    gr.Markdown(_INTRO_MD)

    # Tab 1: upload a labelled CSV and train the model.
    with gr.Tab("📤 Upload & Train Model"):
        gr.Markdown("### Step 1: Upload Training Dataset")
        gr.Markdown("Upload a CSV file containing transaction data with a 'fraud' column (0 = legitimate, 1 = fraud)")
        with gr.Row():
            with gr.Column():
                train_file = gr.File(label="Upload Training CSV", file_types=[".csv"])
                train_button = gr.Button("🚀 Train Model", variant="primary", size="lg")
            with gr.Column():
                train_output = gr.Markdown(label="Training Results")
        train_button.click(
            fn=load_and_train_model,
            inputs=[train_file],
            outputs=[train_output],
        )
        gr.Markdown(_CSV_FORMAT_MD)

    # Tab 2: score one hand-entered transaction.
    with gr.Tab("🔍 Single Prediction"):
        gr.Markdown("### Test Individual Transactions")
        gr.Markdown("Enter transaction details to check if it's fraudulent")
        with gr.Row():
            with gr.Column():
                amount = gr.Number(label="Transaction Amount ($)", value=100)
                hour = gr.Slider(0, 23, step=1, label="Transaction Hour (0-23)", value=14)
                dist_home = gr.Number(label="Distance from Home (km)", value=10)
                dist_last = gr.Number(label="Distance from Last Transaction (km)", value=5)
                ratio_median = gr.Number(label="Ratio to Median Purchase", value=1.0)
            with gr.Column():
                repeat_retailer = gr.Checkbox(label="Repeat Retailer", value=True)
                used_chip = gr.Checkbox(label="Used Chip", value=True)
                used_pin = gr.Checkbox(label="Used PIN", value=True)
                online_order = gr.Checkbox(label="Online Order", value=False)
        predict_button = gr.Button("🔮 Predict", variant="primary", size="lg")
        with gr.Row():
            prediction_output = gr.Markdown(label="Prediction Result")
            prediction_label = gr.Markdown(label="Quick Result")
        # The input order must match predict_single_transaction's parameters.
        predict_button.click(
            fn=predict_single_transaction,
            inputs=[amount, hour, dist_home, dist_last, ratio_median,
                    repeat_retailer, used_chip, used_pin, online_order],
            outputs=[prediction_output, prediction_label],
        )
        gr.Markdown("---")
        gr.Markdown("### 🧪 Quick Test Scenarios")
        with gr.Row():
            for scenario_md in _SCENARIOS_MD:
                gr.Markdown(scenario_md)

    # Tab 3: score a whole CSV and offer the annotated file for download.
    with gr.Tab("📁 Batch Predictions"):
        gr.Markdown("### Upload Multiple Transactions")
        gr.Markdown("Upload a CSV file with multiple transactions to get predictions for all of them")
        with gr.Row():
            with gr.Column():
                batch_file = gr.File(label="Upload Test CSV", file_types=[".csv"])
                batch_button = gr.Button("📊 Predict Batch", variant="primary", size="lg")
            with gr.Column():
                batch_output = gr.Markdown(label="Batch Results")
                download_file = gr.File(label="Download Results CSV")
        # predict_batch returns (file path, summary), hence this output order.
        batch_button.click(
            fn=predict_batch,
            inputs=[batch_file],
            outputs=[download_file, batch_output],
        )

    # Tab 4: static background information.
    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MD)

# Launch the app
if __name__ == "__main__":
    demo.launch()