import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
# Global matplotlib defaults applied to every figure created afterwards.
plt.rcParams.update({
    'axes.titlepad': 20,
    'font.size': 12,
    'axes.titlesize': 20,
})

# Categorical palette: two RGB tuples followed by named / hex colours.
colors = [
    (0/255, 107/255, 164/255),
    (255/255, 128/255, 14/255),
    'red',
    'green',
    '#9E80BA',
    '#8EDB8E',
    '#58517A',
]

# Sequential palette: Ncolors shades sampled between 0.2 and 0.5 of the
# reversed "Blues" colormap.
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report, confusion_matrix
print("Defaults Loaded")

# Read the raw export. low_memory=False lets pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
loans = pd.read_csv('./data/loans_2007.csv', low_memory=False)

# Preview: first three rows, first thirteen columns.
display(loans.iloc[:3, :13])
print(f"Number of Columns: {loans.shape[1]:d}")
target column
# Frequency of each loan outcome in the target column.
print(loans.loc[:, 'loan_status'].value_counts())
Only "Fully Paid" and "Charged Off" describe loans with a final outcome (the other statuses are still ongoing).
# Keep only the rows whose loan_status is one of the two final outcomes.
sel = loans['loan_status'].isin(['Fully Paid', 'Charged Off'])
loans = loans[sel]

# Encode the target as integers: 0 = Charged Off, 1 = Fully Paid.
mapping_dict = {"loan_status": {"Charged Off": 0, "Fully Paid": 1}}
loans = loans.replace(mapping_dict)

# Class counts after filtering and encoding.
print(loans['loan_status'].value_counts())
There is a class imbalance in the target column; keep this in mind.
clean features
# Hand-picked columns removed up front.
# NOTE(review): presumably identifiers, redundant fields and information only
# known after the loan was issued -- confirm against the data dictionary.
cols_to_drop = [
    'id', 'member_id', 'funded_amnt', 'funded_amnt_inv', 'grade', 'sub_grade',
    'emp_title', 'issue_d', 'zip_code', 'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'last_pymnt_amnt',
]
loans = loans.drop(cols_to_drop, axis=1)
print(loans.shape[1])

# Columns holding a single distinct non-missing value carry no information.
drop_columns = [name for name in loans.columns if loans[name].nunique() == 1]
loans = loans.drop(drop_columns, axis=1)
print(loans.shape[1])

# Show every column that still has missing values, then drop
# 'pub_rec_bankruptcies' as a whole column (employment length is kept on
# purpose, as it is likely a good predictor).
null_counts = loans.isnull().sum()
print(null_counts[null_counts > 0.])
loans = loans.drop('pub_rec_bankruptcies', axis=1)

# Remaining gaps are handled by discarding the affected rows.
loans = loans.dropna(axis=0)
null_counts = loans.isnull().sum()

# Persist the filtered table for the next stage.
loans.to_csv('./data/filtered_loans_2007.csv', index=False)
Identify numerical columns that need conversion, and extraneous columns.
loans = pd.read_csv('./data/filtered_loans_2007.csv')

# Inspect one row of the text-typed columns to decide how to treat each.
object_columns_df = loans.select_dtypes(include=['object'])
display(object_columns_df.head(1))

# Percent strings such as "10.65%" become plain floats.
for pct_col in ('int_rate', 'revol_util'):
    loans[pct_col] = loans[pct_col].str.rstrip('%').astype(float)

# Columns not used in the model (treated as extraneous here).
cols_to_drop = ['last_credit_pull_d', 'earliest_cr_line']
loans = loans.drop(cols_to_drop, axis=1)
dummy coding:
# Exploration used to decide which categoricals to keep (left for reference):
#cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state', 'purpose','title']
#for element in cols:
#    print(len(object_columns_df[element].value_counts()))
#    print(object_columns_df[element].value_counts()[:5])
#    print('\n')

# Drop 'addr_state' and 'title'.
cols_to_drop = ['addr_state', 'title']
loans.drop(cols_to_drop, axis=1, inplace=True)

# Map employment length onto an ordinal integer scale.
# NOTE(review): pandas' default NA parsing reads "n/a" as missing, so the
# "n/a" entry is presumably only a safeguard -- confirm.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
loans = loans.replace(mapping_dict)

# Dummy-code 'home_ownership', 'verification_status', 'purpose' and 'term'.
cols = ['home_ownership', 'verification_status', 'purpose', 'term']
for element in cols:
    loans[element] = loans[element].astype('category')

# Pin the indicator dtype to uint8 (the historical default). pandas >= 2.0
# defaults to bool, which comes back from the CSV round trip as bool and is
# then excluded by select_dtypes([np.number]) during feature selection.
dummy_df = pd.get_dummies(loans[cols], dtype=np.uint8)
loans = pd.concat([loans, dummy_df], axis=1)
loans.drop(cols, axis=1, inplace=True)
print(len(loans.columns))

# Shuffle the rows with a fixed seed so later row-prefix subsets are random
# but reproducible.
np.random.seed(1)
loans = loans.iloc[np.random.permutation(len(loans))]

# Persist the model-ready table.
loans.to_csv('./data/cleaned_loans_2007.csv', index=False)
load cleaned data:
loans = pd.read_csv('./data/cleaned_loans_2007.csv')
target = 'loan_status'
# Every column except the target is a candidate feature.
feature_columns = [name for name in loans.columns if name != 'loan_status']
def select_features(df, target, model):
    """Pick features by recursive feature elimination with cross-validation.

    Only numeric columns without missing values are considered. The target
    column is separated from the rest, an ``RFECV`` selector (10 folds) is
    fitted with ``model``, and the names of the retained columns are printed
    and returned.

    Parameters
    ----------
    df : DataFrame containing the candidate features and the target column.
    target : name of the target column in ``df``.
    model : estimator handed to ``RFECV``.

    Returns
    -------
    list of the column names kept by the selector.
    """
    # Restrict to numeric, fully populated columns.
    numeric_df = df.select_dtypes([np.number]).dropna(axis=1)

    # Split into target and predictors.
    all_y = numeric_df[target]
    all_X = numeric_df.drop(target, axis=1)

    # cv is the number of cross-validation folds.
    selector = RFECV(model, cv=10)
    selector.fit(all_X, all_y)

    optimized_columns = all_X.columns[selector.support_].tolist()
    print(f"Best Columns \n{'-' * 12}\n{optimized_columns}\n")
    return optimized_columns
# Feature selection runs on the first fifth of the rows only.

# Random forest: balanced class weights, fixed seed.
model = RandomForestClassifier(
    n_estimators=50,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=1,
)
optimized_columns_RFC = select_features(loans.iloc[:len(loans) // 5], target, model)

# Logistic regression: balanced class weights.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=3000,
    class_weight='balanced',
)
optimized_columns_LR = select_features(loans.iloc[:len(loans) // 5], target, model)
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Suppress scikit-learn's UndefinedMetricWarning for the rest of the session.
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
def select_model(df, features_list, target, models_to_fit):
    """Grid-search hyperparameters for each requested classifier.

    For every candidate whose name appears in ``models_to_fit`` a
    ``GridSearchCV`` (10 folds, refit on ROC AUC) is run on the feature
    columns listed for that model. A summary is printed and the best
    parameters, score and estimator are stored in the candidate's dict.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : dict mapping a model name to the list of feature columns
        to use for that model.
    target : name of the target column in ``df``.
    models_to_fit : collection of model names to fit; candidates not named
        here are skipped.

    Returns
    -------
    list of dict, one per candidate model (skipped ones included). Fitted
    candidates gain the keys ``best_params``, ``best_score`` and
    ``best_estimator``.
    """
    dicts = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(max_iter=5000),
            "hyperparameters": {
                "solver": ["newton-cg", "lbfgs", "liblinear"],
                # None (not "") is the documented value for uniform weights;
                # an empty string is rejected by parameter validation.
                "class_weight": ["balanced", None],
            },
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters": {
                "n_neighbors": range(1, 20, 2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1, 2],
            },
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5],
                "class_weight": [None, "balanced", {0: 3, 1: 1}, {0: 5, 1: 1}],
            },
        },
    ]

    # Metrics tracked for every grid point; precision/recall/F1 are reported
    # separately for each class because the target is imbalanced.
    scoring = {
        'ROC_AUC': 'roc_auc',
        'Accuracy': 'accuracy',
        'precision_1': make_scorer(precision_score, pos_label=1),
        'recall_1': make_scorer(recall_score, pos_label=1),
        'f1_1': make_scorer(f1_score, pos_label=1),
        'precision_0': make_scorer(precision_score, pos_label=0),
        'recall_0': make_scorer(recall_score, pos_label=0),
        'f1_0': make_scorer(f1_score, pos_label=0),
    }

    all_y = df[target]
    for element in dicts:
        if element['name'] not in models_to_fit:
            continue

        print(element['name'])
        print('-' * len(element['name']))

        all_X = df[features_list[element['name']]]
        model = element['estimator']

        # The former `iid` argument is not passed: it was removed from
        # GridSearchCV and passing it raises TypeError on current releases.
        grid = GridSearchCV(
            model,
            element['hyperparameters'],
            cv=10,
            scoring=scoring,
            refit='ROC_AUC',
        )
        grid.fit(all_X, all_y)

        element['best_params'] = grid.best_params_
        element['best_score'] = grid.best_score_
        element['best_estimator'] = grid.best_estimator_

        # Each line is the maximum of that metric over the whole grid, so the
        # values may come from different parameter combinations.
        for scorer in scoring:
            print(f"{scorer}: {max(grid.cv_results_['mean_test_'+scorer]):0.3f}")
        print("Best Parameters: {}".format(grid.best_params_))
        print("Best Score: {:0.3f}\n\n".format(grid.best_score_))
    return dicts
models_to_fit = [
    'LogisticRegression',
    'KNeighborsClassifier',
    'RandomForestClassifier',
]

# Feature set per model; k-nearest neighbours reuses the logistic-regression
# selection.
optimized_columns = {
    'LogisticRegression': optimized_columns_LR,
    'KNeighborsClassifier': optimized_columns_LR,
    'RandomForestClassifier': optimized_columns_RFC,
}

# The grid search runs on the first tenth of the rows only.
model_dicts = select_model(
    loans.iloc[:len(loans) // 10],
    optimized_columns,
    target,
    models_to_fit,
)
print("model selection finished")
# Evaluate the tuned random forest with shuffled 10-fold cross-validation.
kf = KFold(10, shuffle=True, random_state=1)
best_model = {
    'class_weight': 'balanced',
    'criterion': 'entropy',
    'max_depth': 5,
    'max_features': 'log2',
    'min_samples_leaf': 8,
    'min_samples_split': 2,
    'n_estimators': 20,
}
model = RandomForestClassifier()
model.set_params(**best_model)
predictions = cross_val_predict(model, loans[optimized_columns_RFC], loans[target], cv=kf)
predictions = pd.Series(predictions)

# Classification report.
print(classification_report(loans[target], predictions))

# Rows are true labels and columns are predicted labels, in the order given
# by `labels`, so the layout is [[tn, fp], [fn, tp]] with 1 (Fully Paid) as
# the positive class. Passing labels explicitly keeps the matrix 2x2 even if
# one class is never predicted.
c_matrix = confusion_matrix(loans[target], predictions, labels=[0, 1])
print(c_matrix)
tn, fp, fn, tp = c_matrix.ravel()

tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
print("TPR:{:0.3f}, FPR:{:0.3f}".format(tpr, fpr))
# Baseline: logistic regression on all features, no class weighting.
loans = pd.read_csv('./data/cleaned_loans_2007.csv')
features_df = loans.drop('loan_status', axis=1)
target = 'loan_status'

lr = LogisticRegression(solver='lbfgs', max_iter=30000)
# No random_state: without shuffle=True it has no effect, and recent
# scikit-learn releases raise ValueError when it is set. The fold
# assignment is unchanged.
kf = KFold(10)
predictions = cross_val_predict(lr, features_df, loans[target], cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts with 1 (Fully Paid) as the positive class.
tp = len(predictions[(loans[target]==1) & (predictions==1)])
tn = len(predictions[(loans[target]==0) & (predictions==0)])
fp = len(predictions[(loans[target]==0) & (predictions==1)])
fn = len(predictions[(loans[target]==1) & (predictions==0)])

tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
print("TPR:{:0.3f}, FPR:{:0.3f}".format(tpr, fpr))
Correct for the imbalance by penalizing misclassifications of the less prevalent class more heavily than those of the other class:
# Logistic regression with class weights set to 'balanced'.
lr = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=3000)
# 10-fold cross-validation. No random_state: without shuffle=True it has no
# effect, and recent scikit-learn releases raise ValueError when it is set.
kf = KFold(10)
predictions = cross_val_predict(lr, features_df, loans[target], cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts with 1 (Fully Paid) as the positive class.
tp = len(predictions[(loans[target]==1) & (predictions==1)])
tn = len(predictions[(loans[target]==0) & (predictions==0)])
fp = len(predictions[(loans[target]==0) & (predictions==1)])
fn = len(predictions[(loans[target]==1) & (predictions==0)])

tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
print("TPR:{:0.3f}, FPR:{:0.3f}".format(tpr, fpr))
Setting class_weight to 'balanced' assigns a penalty of ~5.89 for misclassifying 0 (there are 5.89 times more 1's than 0's).
# Logistic regression with a manual penalty: errors on class 0 weigh ten
# times as much as errors on class 1.
penalty = {0: 10, 1: 1}
lr = LogisticRegression(solver='lbfgs', class_weight=penalty, max_iter=3000)
# 10-fold cross-validation. No random_state: without shuffle=True it has no
# effect, and recent scikit-learn releases raise ValueError when it is set.
kf = KFold(10)
predictions = cross_val_predict(lr, features_df, loans[target], cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts with 1 (Fully Paid) as the positive class.
tp = len(predictions[(loans[target]==1) & (predictions==1)])
tn = len(predictions[(loans[target]==0) & (predictions==0)])
fp = len(predictions[(loans[target]==0) & (predictions==1)])
fn = len(predictions[(loans[target]==1) & (predictions==0)])

tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
print("TPR:{:0.3f}, FPR:{:0.3f}".format(tpr, fpr))
lower false positive rate at the expense of true positives