import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from inspect import signature
import myML_functions
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score
df = pd.read_csv("./data/Fraud/chapter_1/creditcard_sampledata_3.csv", index_col=0)
#Count the occurrences of fraud
print(df['Class'].value_counts(normalize=True))
print('')
# Explore the features available in your dataframe
print(df.info())
y = df['Class']
X = df.drop(['Class'], axis=1)
fig, ax = plt.subplots(figsize=(6,5))
ax.set_xlabel('V1')
ax.set_ylabel('V2')
ax.scatter(X.loc[y == 0, 'V1'], X.loc[y == 0, 'V2'], label="Class=0 (Non-Fraud)", alpha=0.5, linewidth=0.15)
ax.scatter(X.loc[y == 1, 'V1'], X.loc[y == 1, 'V2'], label="Class=1 (Fraud)", alpha=0.5, linewidth=0.15, c='r')
ax.legend()
plt.show()
There is a large class imbalance in the data, so we need re-weighting or re-sampling before training a classifier.
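As a quick look at the re-weighting option, here is a minimal sketch (using the y defined above, not part of the pipeline below) that quantifies the imbalance and derives sklearn's 'balanced' class weights:

# Sketch: quantify the imbalance and derive balanced class weights (illustrative only)
from sklearn.utils.class_weight import compute_class_weight
counts = y.value_counts()
print(f"imbalance ratio (non-fraud : fraud) = {counts[0] / counts[1]:0.1f} : 1")
weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y)
print(dict(zip([0, 1], weights)))  # the minority class gets a proportionally larger weight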
def get_model_result(X_train, y_train, X_test, y_test, model):
    """Fit the model, then report R2, ROC AUC, the classification report and a confusion matrix."""
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    probs = model.predict_proba(X_test)
    print(f"R2: {r2_score(y_test, predicted):0.3f}")
    print(f"Roc_Auc: {roc_auc_score(y_test, probs[:,1]):0.3f}\n")
    print('Classification report:\n', classification_report(y_test, predicted))
    #conf_mat = confusion_matrix(y_test, predicted)
    #print('Confusion matrix:\n', conf_mat)
    class_names = np.array(['Non-Fraud', 'Fraud'])
    myML_functions.plot_confusion_matrix(y_test, predicted, classes=class_names)
    plt.show()
print("Mean of V1, V2 and V3 for non-fraud and fraud:")
mean = np.array(df.loc[df['Class']==0,:].groupby('Class').mean())[0]
std = np.array(df.loc[df['Class']==0,:].groupby('Class').std())[0]
display(df.groupby('Class').mean().iloc[:,:9])
print("")
# Implement a rule for stating which cases are flagged as fraud
df['flag_as_fraud'] = 0
sel = True
for ii in range(0, 10):
    # a row stays flagged only if every one of the first 10 features lies more than 2 std
    # from the non-fraud mean; note the parentheses around the | so it is evaluated before & sel
    sel = ((df.iloc[:,ii] < (mean[ii] - 2*std[ii])) | (df.iloc[:,ii] > (mean[ii] + 2*std[ii]))) & sel
df.loc[sel,'flag_as_fraud'] = 1
#print(pd.crosstab(df.Class, df.flag_as_fraud, rownames=['Actual Fraud'], colnames=['Flagged Fraud']))
class_names = np.array(['Non-Fraud', 'Fraud'])
myML_functions.plot_confusion_matrix(df['Class'], df['flag_as_fraud'], classes=class_names)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#method = RandomOverSampler()
#method = BorderlineSMOTE(kind='borderline-1')
method = BorderlineSMOTE(kind='borderline-2')
#method = SMOTE()
# convert to arrays so the resampled output can be indexed positionally in the plot below
X_resampled, y_resampled = method.fit_resample(np.array(X_train), np.array(y_train))
fig, ax = plt.subplots(figsize=(6,5))
ax.set_xlabel('V1')
ax.set_ylabel('V2')
ax.scatter(X_resampled[y_resampled == 0, 0], X_resampled[y_resampled == 0, 1],
label="Class=0 (Non-Fraud)", alpha=0.5, linewidth=0.15)
ax.scatter(X_resampled[y_resampled == 1, 0], X_resampled[y_resampled == 1, 1],
label="Class=1 (Fraud)", alpha=0.5, linewidth=0.15, c='r')
ax.legend()
plt.show()
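A quick sanity check, sketched here with collections.Counter, confirms what the scatter shows: the resampled training set is now balanced.

# Sketch: class counts before and after resampling
from collections import Counter
print("before:", Counter(y_train))
print("after: ", Counter(y_resampled))  # SMOTE synthesizes minority samples up to parity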
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
method = BorderlineSMOTE(kind='borderline-2')
X_resampled, y_resampled = method.fit_resample(np.array(X_train), np.array(y_train))
#model = LogisticRegression(solver='lbfgs', max_iter=10000)
model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
get_model_result(X_resampled, y_resampled, X_test, y_test, model)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
resampling = BorderlineSMOTE(kind='borderline-2')
#model = LogisticRegression(solver='lbfgs', max_iter=10000)
model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
pipeline = Pipeline([('smote', resampling), ('rf', model)])
get_model_result(X_train, y_train, X_test, y_test, pipeline)
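The pipeline is worth the extra step: the resampler runs only when the pipeline is fitted, so the held-out data is never resampled. The same property makes cross-validation leakage-free; a minimal sketch, reusing the pipeline above with sklearn's cross_val_score:

# Sketch: SMOTE is applied to the training folds only, never to the validation folds
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
print(f"ROC AUC per fold: {np.round(scores, 3)}, mean: {scores.mean():0.3f}")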
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = RandomForestClassifier(n_estimators=300, n_jobs=-1, class_weight='balanced_subsample', random_state=5)
get_model_result(X_train, y_train, X_test, y_test, model)
model = RandomForestClassifier(n_estimators=300, bootstrap=True, class_weight={0:1, 1:12}, criterion='entropy',
max_depth=10, min_samples_leaf=10, n_jobs=-1, random_state=5)
get_model_result(X_train, y_train, X_test, y_test, model)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
param_grid = {'n_estimators': [1, 30],
              'max_features': ['sqrt', 'log2'],
              'max_depth': [4, 8],
              'criterion': ['gini', 'entropy']}
model = RandomForestClassifier(random_state=5)
CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='recall')
CV_model.fit(X_train, y_train)
print(CV_model.best_params_)
print(CV_model.best_estimator_)
print(CV_model.best_score_)
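Beyond best_params_, GridSearchCV stores every candidate's scores in cv_results_; a short sketch to rank the whole grid by mean test recall:

# Sketch: inspect the full grid, ranked by CV recall
cv_results = pd.DataFrame(CV_model.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(cv_results[cols].sort_values('rank_test_score').head())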
model = RandomForestClassifier(class_weight={0:1,1:12}, criterion='gini', n_estimators=30, max_features='log2',
min_samples_leaf=10, max_depth=8, n_jobs=-1, random_state=5)
get_model_result(X_train, y_train, X_test, y_test, model)
kf = KFold(10, shuffle=True, random_state=1)
predictions = cross_val_predict(model, X, y, cv=kf)
predictions = pd.Series(predictions)
pred_proba = cross_val_predict(model, X, y, cv=kf, method='predict_proba')
print(f"R2: {r2_score(y, predictions):0.3f}")
print(f"Roc_Auc: {roc_auc_score(y, pred_proba[:,1]):0.3f}\n")
print('Classification report:\n', classification_report(y, predictions))
class_names = np.array(['Non-Fraud', 'Fraud'])
myML_functions.plot_confusion_matrix(y, predictions, classes=class_names)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf1 = LogisticRegression(class_weight={0:1, 1:15}, solver='lbfgs', max_iter=1000, random_state=1)
clf2 = RandomForestClassifier(class_weight={0:1, 1:12}, criterion='gini', max_depth=8, max_features='log2',
min_samples_leaf=10, n_estimators=100, n_jobs=-1, random_state=5)
clf3 = GaussianNB()
#clf3 = DecisionTreeClassifier(random_state=5, class_weight="balanced")
estimators = [('lr', clf1),('rf', clf2), ('gnb', clf3)]
#ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
ensemble_model = VotingClassifier(estimators=estimators, voting='soft', weights=[1,1,1])
get_model_result(X_train, y_train, X_test, y_test, ensemble_model)
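With soft voting, the weights argument scales each model's contribution to the averaged probabilities. As an illustrative variation (these weights are an assumption, not tuned here), up-weighting the random forest:

# Sketch: give the random forest a larger vote (weights chosen for illustration only)
weighted_ensemble = VotingClassifier(estimators=estimators, voting='soft', weights=[1, 4, 1])
get_model_result(X_train, y_train, X_test, y_test, weighted_ensemble)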
model = ensemble_model
kf = KFold(10, shuffle=True, random_state=1)
predictions = cross_val_predict(model, X, y, cv=kf)
predictions = pd.Series(predictions)
pred_proba = cross_val_predict(model, X, y, cv=kf, method='predict_proba')
print(f"R2: {r2_score(y, predictions):0.3f}")
print(f"Roc_Auc: {roc_auc_score(y, pred_proba[:,1]):0.3f}\n")
print('Classification report:\n', classification_report(y, predictions))
class_names = np.array(['Non-Fraud', 'Fraud'])
myML_functions.plot_confusion_matrix(y, predictions, classes=class_names)
plt.show()