import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import myML_functions as myML_functions
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score
from sklearn.metrics import homogeneity_score, silhouette_score
# Load the raw BankSim transactions and print the per-category column means.
df = pd.read_csv('./data/Fraud/chapter_3/banksim.csv')
# numeric_only=True restricts the mean to numeric columns. Since pandas 2.0 the
# default no longer drops non-numeric columns silently and raises TypeError
# instead. NOTE(review): assumes the raw file carries non-numeric columns
# besides 'category' - confirm against the CSV.
print(df.groupby('category').mean(numeric_only=True))
# Leisure- and travel-related transactions have high rates of fraud.
# Reload the adjusted dataset and print mean amount and fraud rate per age group.
df = pd.read_csv('./data/Fraud/chapter_3/banksim_adj.csv')
print(df.groupby('age')[['amount', 'fraud']].mean())
# There is no particular trend with age.
# Split the transactions by label so the two amount distributions can be compared.
fraud_flag = df['fraud']
df_fraud = df.loc[fraud_flag == 1]
df_non_fraud = df.loc[fraud_flag == 0]

# Overlay semi-transparent histograms of the amounts, fraud first.
for subset, tag in ((df_fraud, 'fraud'), (df_non_fraud, 'nonfraud')):
    plt.hist(subset['amount'], alpha=0.5, label=tag)
plt.legend()
plt.show()
# Fraudulent transactions tend to be for large amounts.
def print_model_result(y_true, y_predicted):
    """Print evaluation metrics for binary fraud predictions and plot the confusion matrix.

    Prints the R2 score, the ROC AUC score and the classification report, then
    draws a confusion matrix labelled 'Non-Fraud' / 'Fraud' and shows the figure.

    Args:
        y_true: Ground-truth labels.
        y_predicted: Predicted labels, same length as ``y_true``.
    """
    r2 = r2_score(y_true, y_predicted)
    print(f"R2: {r2:0.3f}")
    auc = roc_auc_score(y_true, y_predicted)
    print(f"Roc_Auc: {auc:0.3f}\n")

    report = classification_report(y_true, y_predicted)
    print('Classifcation report:\n', report)

    # Plotting is delegated to the project helper module myML_functions.
    labels = np.array(['Non-Fraud', 'Fraud'])
    myML_functions.plot_confusion_matrix(y_true, y_predicted, classes=labels)
    plt.show()
# Scale the features with MinMaxScaler.
# Build the scaled feature matrix and plot an elbow curve over 1-9 clusters.
df = pd.read_csv('./data/Fraud/chapter_3/banksim_adj.csv')
y = df['fraud']
# DataFrame.drop returns a new frame, so its result must be used; otherwise
# the 'fraud' label stays in X and leaks into the clustering features.
# The builtin float replaces the np.float alias, which was removed in NumPy 1.24.
X = np.array(df.drop('fraud', axis=1)).astype(float)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Fit one MiniBatchKMeans per candidate cluster count and record its score.
clustno = range(1, 10)
kmeans = [MiniBatchKMeans(n_clusters=i) for i in clustno]
score = [model.fit(X_scaled).score(X_scaled) for model in kmeans]

plt.plot(clustno, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()
# Use 4 clusters.
# Flag as fraud the test points that lie farthest from their assigned centroid.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)
kmeans = MiniBatchKMeans(n_clusters=4, random_state=42).fit(X_train)

# Obtain predictions and calculate distance from cluster centroid.
# Indexing the centroids by the predicted cluster lines up one centroid per
# test row, so the row-wise norm replaces the per-sample Python loop.
X_test_clusters = kmeans.predict(X_test)
X_test_clusters_centers = kmeans.cluster_centers_
dist = np.linalg.norm(X_test - X_test_clusters_centers[X_test_clusters], axis=1)

# Create fraud predictions based on outliers on clusters: distances at or
# above the 95th percentile become 1.0, everything else 0.0. The threshold is
# computed once and compared against an array rather than a plain list.
threshold = np.percentile(dist, 95)
km_y_pred = (dist >= threshold).astype(float)
print_model_result(y_test, km_y_pred)
# Fit DBSCAN on the training split and report clustering quality.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)
db = DBSCAN(eps=0.9, min_samples=10, n_jobs=-1).fit(X_train)
pred_labels = db.labels_
# The label -1 marks noise, not a cluster, so it is excluded from the count.
# The membership test must run on the predicted labels: `-1 in y` would test
# the index of the pandas Series y, not any label values.
n_clusters = len(set(pred_labels)) - (1 if -1 in pred_labels else 0)
print('Estimated number of clusters: %d' % n_clusters)
print("Homogeneity: %0.3f" % homogeneity_score(y_train, pred_labels))
print("Silhouette Coefficient: %0.3f" % silhouette_score(X_train, pred_labels))
def dbscan_predict(model, X):
    """Assign new samples to the clusters of an already fitted DBSCAN-like model.

    Each row of ``X`` takes the label of its nearest core sample when that core
    sample is closer than ``model.eps`` (Euclidean distance); otherwise it is
    labelled -1.

    Args:
        model: Object exposing ``components_`` (core sample coordinates),
            ``core_sample_indices_``, ``labels_`` and ``eps``.
        X: 2-D array with one sample per row.

    Returns:
        Integer array of length ``X.shape[0]`` holding a cluster label per row,
        or -1 where no core sample is within ``eps``.
    """
    nr_samples = X.shape[0]
    y_new = np.ones(shape=nr_samples, dtype=int) * -1

    core_points = model.components_
    # With no core samples there is nothing to match against, and np.argmin
    # would raise on an empty distance array; every sample stays -1.
    if len(core_points) == 0:
        return y_new

    for i in range(nr_samples):
        diff = core_points - X[i, :]  # NumPy broadcasting
        dist = np.linalg.norm(diff, axis=1)  # Euclidean distance
        shortest_dist_idx = np.argmin(dist)
        if dist[shortest_dist_idx] < model.eps:
            # components_ is indexed by core sample, labels_ by training row,
            # so translate through core_sample_indices_.
            y_new[i] = model.labels_[model.core_sample_indices_[shortest_dist_idx]]
    return y_new
# Predict new labels on the test dataset.
# Label the held-out split with the fitted DBSCAN model and report quality.
pred_labels = dbscan_predict(db, X_test)
# -1 marks samples that matched no cluster, so it is excluded from the count.
# The membership test must run on the predicted labels: `-1 in y` would test
# the index of the pandas Series y, not any label values.
n_clusters = len(set(pred_labels)) - (1 if -1 in pred_labels else 0)
print('Estimated number of clusters: %d' % n_clusters)
print("Homogeneity: %0.3f" % homogeneity_score(y_test, pred_labels))
print("Silhouette Coefficient: %0.3f" % silhouette_score(X_test, pred_labels))
# Find the smallest clusters to identify as fraud.
# Count the samples per cluster, ignoring the negative (unassigned) label.
assigned = pred_labels >= 0
counts = np.bincount(pred_labels[assigned])
print(counts)

# Order the clusters by ascending sample count and keep the first ten.
smallest_clusters = counts.argsort()[:10]
print("The smallest clusters are clusters:", smallest_clusters, sep="\n")
print("Their counts are:", counts[smallest_clusters], sep="\n")
# Test fraud detection accuracy with DBSCAN.
# Treat membership in one of the smallest clusters as a fraud prediction and
# evaluate it against the true labels.
db_df = pd.DataFrame({'clusternr': pred_labels, 'fraud': y_test})
db_df['predicted_fraud'] = 0
for cluster in smallest_clusters:
    # A single .loc call with row mask and column writes to db_df itself. The
    # chained form db_df['predicted_fraud'].loc[...] = 1 may write to a
    # temporary object and is a no-op under pandas Copy-on-Write.
    db_df.loc[db_df['clusternr'] == cluster, 'predicted_fraud'] = 1
print_model_result(db_df['fraud'], db_df['predicted_fraud'])