import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
import time
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
import myML_functions as myML_functions
# Global matplotlib defaults: padding below axes titles and font sizes.
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
# Plotting palette: two RGB tuples (components scaled to 0-1), two named colors and three hex colors.
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
# Number of shades sampled from the colormap below.
Ncolors = 10
# Ncolors RGBA entries taken at evenly spaced positions between 0.2 and 0.5 of the reversed "Blues" colormap.
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
print("Defaults Loaded")
# index_col=False stops pandas from using the first column (age) as the row index.
income = pd.read_csv("./data/income.csv", index_col=False)
# Preview the first two rows: the first twelve columns plus the last one.
display(income.iloc[:2, list(range(12)) + [-1]])

# Replace every text category by its integer category code.
cols = ['workclass', 'education', 'marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country', 'high_income']
for column_name in cols:
    income[column_name] = pd.Categorical(income[column_name]).codes

# Remove columns without interesting information.
income = income.drop(columns=['fnlwgt', 'capital_gain', 'capital_loss'])
display(income.head(3))
# Columns to train with (all have been converted to numeric).
all_columns = ["age", "workclass", "education", "education_num", "marital_status", "occupation",
               "relationship", "race", "sex", "hours_per_week", "native_country"]
target = 'high_income'

# Shuffle the rows (seeded so the split can be replicated).
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))

# Hold-out validation: first 80% of the shuffled rows for training, the rest for testing.
split_row = int(len(income) * 0.8)
train = income[:split_row]
test = income[split_row:]

# Unconstrained tree (random_state=1 to be able to replicate).
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[all_columns], train[target])

# ROC AUC ranks samples by a score, so feed it the predicted probability of
# class 1 rather than hard labels (hard labels collapse the curve to one point).
test_scores = clf.predict_proba(test[all_columns])[:, 1]
test_auc = roc_auc_score(test[target], test_scores)
train_scores = clf.predict_proba(train[all_columns])[:, 1]
train_auc = roc_auc_score(train[target], train_scores)
print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))
The predictions on the train set are significantly better than on the test set, which hints at overfitting. To avoid overfitting, restrict max_depth and min_samples_split.
# Same hold-out experiment with a constrained tree: limited depth and a
# minimum number of samples required before a node may be split.
clf = DecisionTreeClassifier(random_state=1, max_depth=7, min_samples_split=13)
clf.fit(train[all_columns], train[target])

# Score with the predicted probability of class 1; ROC AUC computed from hard
# labels reduces the curve to a single operating point.
test_scores = clf.predict_proba(test[all_columns])[:, 1]
test_auc = roc_auc_score(test[target], test_scores)
train_scores = clf.predict_proba(train[all_columns])[:, 1]
train_auc = roc_auc_score(train[target], train_scores)
print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))
Decision Tree Classifier
# 10-fold cross-validated evaluation of a tuned decision tree on the full data set.
kf = KFold(10, shuffle=True, random_state=1)
# 'sqrt' is what max_features='auto' meant for tree classifiers; 'auto' is no
# longer accepted by current scikit-learn releases.
best_model = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt',
              'min_samples_leaf': 5, 'min_samples_split': 13, 'random_state': 1}
model = DecisionTreeClassifier()
model.set_params(**best_model)

# Out-of-fold class labels for the report and confusion matrix.
predictions = pd.Series(cross_val_predict(model, income[all_columns], income[target], cv=kf))
# Out-of-fold probabilities for ROC AUC, which needs a ranking score rather than labels.
pred_proba = cross_val_predict(model, income[all_columns], income[target], cv=kf, method='predict_proba')

print(classification_report(income[target], predictions))
cm = confusion_matrix(income[target], predictions)
myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
print(f"\nROC_AUC: {roc_auc_score(income[target], pred_proba[:, 1]):0.3f}")
Random Forest Classifier
# 10-fold cross-validated evaluation of a tuned random forest on the full data set.
kf = KFold(10, shuffle=True, random_state=1)
# 'sqrt' is what max_features='auto' meant for forest classifiers; 'auto' is no
# longer accepted by current scikit-learn releases.
best_model = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt',
              'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 150, 'n_jobs': -1,
              'random_state': 1}
model = RandomForestClassifier()
model.set_params(**best_model)

# Out-of-fold class labels for the report and confusion matrix.
predictions = pd.Series(cross_val_predict(model, income[all_columns], income[target], cv=kf))
# Out-of-fold probabilities for ROC AUC, which needs a ranking score rather than labels.
pred_proba = cross_val_predict(model, income[all_columns], income[target], cv=kf, method='predict_proba')

print(classification_report(income[target], predictions))
cm = confusion_matrix(income[target], predictions)
myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
print(f"\nROC_AUC: {roc_auc_score(income[target], pred_proba[:, 1]):0.3f}")
def select_features(df, target, model, cv=10):
    """Select the best feature columns for ``model`` by recursive feature elimination.

    Only numeric columns without missing values are considered. The selected
    column names are printed and returned.

    Args:
        df: DataFrame holding the candidate features and the target column.
        target: Name of the target column in ``df``.
        model: Estimator handed to ``RFECV`` to rank the features.
        cv: Cross-validation setting forwarded to ``RFECV`` (default: 10 folds).

    Returns:
        List with the names of the columns kept by the selector.

    Raises:
        KeyError: If ``target`` is not among the numeric, NaN-free columns.
    """
    # Keep numeric columns only and drop every column containing NaNs.
    numeric_df = df.select_dtypes([np.number]).dropna(axis=1)
    if target not in numeric_df.columns:
        raise KeyError(f"target column '{target}' is missing, non-numeric or contains NaNs")

    all_X = numeric_df.drop(target, axis=1)
    all_y = numeric_df[target]

    selector = RFECV(model, cv=cv)
    selector.fit(all_X, all_y)

    # support_ is a boolean mask over the columns of all_X.
    optimized_columns = list(all_X.columns[selector.support_])
    print(f"Best Columns, {type(model).__name__} model: {optimized_columns}\n")
    print('----------------------------------------------------\n')
    return optimized_columns
# Random forest used to rank features during recursive feature elimination.
# 'sqrt' is what max_features='auto' meant for forest classifiers; 'auto' is no
# longer accepted by current scikit-learn releases.
model = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                               max_depth=5, max_features='sqrt', class_weight='balanced', n_jobs=-1)
opt_cols_RFC = select_features(income, target, model)

# Manually chosen alternative to the automatically selected columns.
selected_features_manual = ["age", "workclass", "education", "education_num", "marital_status", "occupation",
                            "relationship", "race", "sex", "hours_per_week", "native_country"]
def _model_search_space():
    """Return the candidate estimators together with the hyper-parameter grid searched for each."""
    # Fixed members of the soft-voting ensemble. 'sqrt' (forest) and None
    # (gradient boosting) are what max_features='auto' meant for these
    # estimators; 'auto' is no longer accepted by current scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                                max_depth=5, max_features='sqrt', class_weight='balanced')
    gb = GradientBoostingClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                                    max_depth=5, max_features=None)
    # NOTE(review): recent scikit-learn releases deprecate 'SAMME.R' (here and in
    # the AdaBoost grid below) -- confirm against the installed version.
    ab = AdaBoostClassifier(n_estimators=150, random_state=1, learning_rate=0.5, algorithm='SAMME.R')

    return [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(max_iter = 50000),
            "hyperparameters":
                {
                    "solver": ["newton-cg", "lbfgs", "liblinear"],
                    # None (no re-weighting) replaces the former "" placeholder,
                    # which is not a valid class_weight value.
                    "class_weight": ["balanced", None]
                }
        },
        {
            "name": "GaussianNB",
            "estimator": GaussianNB(),
            "hyperparameters":
                {
                    "var_smoothing": [1.e-8,1.e-9,1.e-10]
                }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
                {
                    "n_neighbors": range(1,20,2),
                    "weights": ["distance", "uniform"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "p": [1,2]
                }
        },
        {
            "name": "BaggingClassifier",
            "estimator": BaggingClassifier(KNeighborsClassifier(algorithm='kd_tree', n_neighbors=13,
                                                                p=2, weights='uniform')),
            "hyperparameters":
                {
                    "n_estimators": [5, 20, 100],
                    "max_samples" :[0.1, 0.5, 0.8],
                    "max_features" :[0.1, 0.5, 0.8],
                    "bootstrap" :[True, False],
                    "warm_start" :[True, False]
                }
        },
        {
            "name": "GradientBoostingClassifier",
            "estimator": GradientBoostingClassifier(),
            "hyperparameters":
                {
                    "n_estimators": [5, 20, 100],
                    # None (use all features) is what "auto" meant for gradient boosting.
                    "max_features": [None, "log2", "sqrt"],
                    "learning_rate":[0.01, 0.05, 0.1, 0.5],
                    "subsample":[0.1, 0.5, 1.0],
                    "random_state":[1]
                }
        },
        {
            "name": "AdaBoostClassifier",
            "estimator": AdaBoostClassifier(),
            "hyperparameters":
                {
                    "n_estimators": [5, 20, 100],
                    "learning_rate":[0.01, 0.05, 0.1, 0.5],
                    "algorithm": ['SAMME','SAMME.R'],
                    "random_state":[1]
                }
        },
        {
            "name": "DecisionTreeClassifier",
            "estimator": DecisionTreeClassifier(),
            "hyperparameters":
                {
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2, 5, 10],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": range(1,20,3),
                    "min_samples_split": range(2,20,3),
                    "class_weight": [None, "balanced"]
                }
        },
        {
            "name": "ExtraTreesClassifier",
            "estimator": ExtraTreesClassifier(),
            "hyperparameters":
                {
                    "n_estimators": [5, 20, 100],
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2, 5, 10],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5],
                    "class_weight": [None, "balanced", {0: 1, 1: 3}, {0: 1, 1: 5}]
                }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters":
                {
                    "n_estimators": [5, 20, 100],
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2, 5, 10],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5],
                    "class_weight": [None, "balanced", {0: 1, 1: 3}, {0: 1, 1: 5}]
                }
        },
        {
            "name": "VotingClassifier",
            "estimator": VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('ab', ab)]),
            "hyperparameters":
                {
                    "voting": ["soft"]
                }
        }]


def _print_search_report(grid, scoring, refit_metric):
    """Print the scores of a fitted multi-metric grid search, highlighting the refit metric."""
    results = grid.cv_results_
    # Index of the parameter combination ranked first on the refit metric.
    best_index = grid.best_index_

    # Best score for the metric used to choose the best model.
    print(f"{refit_metric}:{max(results['mean_test_'+refit_metric]):0.3f} ", end="")
    print("(Metric used to choose best model)\n")

    # For every other metric: best value over all candidates vs. value reached by the chosen model.
    print(f"{' ':10} best_value | value_best_model")
    for scorer in scoring:
        if scorer != refit_metric:
            score_best_model = results['mean_test_%s' % scorer][best_index]
            print(f"{scorer+':':<15} {max(results['mean_test_'+scorer]):0.3f} | {score_best_model:0.3f}")
    print("")
    print(f"Best Parameters: {grid.best_params_}")
    print(f"Best Score: {grid.best_score_:0.3f}\n")


def select_model(df, target, models_to_fit, refit_metric = 'ROC_AUC'):
    """Grid-search the requested models with 10-fold CV and print a score report for each.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target: Name of the target column in ``df``.
        models_to_fit: Mapping ``label -> [model_name, feature_columns]``.
            ``model_name`` must match a ``"name"`` in the built-in search space;
            ``feature_columns`` lists the columns of ``df`` used as features.
        refit_metric: Key of the scoring dictionary used to pick (and refit) the
            best candidate: 'ROC_AUC', 'Accuracy', 'recall', 'recall_1' or 'recall_0'.

    Returns:
        The list of search-space dictionaries. Entries that were fitted
        additionally carry 'best_params', 'best_score' and 'best_estimator'.
    """
    dicts = _model_search_space()
    specs_by_name = {spec['name']: spec for spec in dicts}

    scoring = {'ROC_AUC':'roc_auc', 'Accuracy':'accuracy',
               'recall': make_scorer(recall_score, average='weighted'),
               'recall_1': make_scorer(recall_score, pos_label=1),
               'recall_0': make_scorer(recall_score, pos_label=0)}

    all_y = df[target]
    for key, models_list in models_to_fit.items():
        print(key)
        print('-'*len(key))
        start = time.time()

        element = specs_by_name.get(models_list[0])
        if element is None:
            # Previously an unknown name was skipped without any notice.
            print(f"No search space defined for '{models_list[0]}'; skipping.\n")
        else:
            all_X = df[models_list[1]]
            # The former iid=True argument is omitted: GridSearchCV no longer
            # accepts it in current scikit-learn releases.
            grid = GridSearchCV(element['estimator'], element['hyperparameters'], cv=10, scoring=scoring,
                                refit=refit_metric, n_jobs=1)
            grid.fit(all_X, all_y)
            element['best_params'] = grid.best_params_
            element['best_score'] = grid.best_score_
            element['best_estimator'] = grid.best_estimator_
            _print_search_report(grid, scoring, refit_metric)

        print(f"Time elapsed: {(time.time()-start)/60.:0.2f} mins\n")
        print("="*110)
        print("\n")
    return dicts
# Every candidate model is searched on the columns picked by the RFECV run.
# For a quick run, restrict the tuple below to a single name,
# e.g. ('DecisionTreeClassifier',).
models_to_fit = {
    model_name: [model_name, opt_cols_RFC]
    for model_name in (
        'VotingClassifier',
        'LogisticRegression',
        'GaussianNB',
        'KNeighborsClassifier',
        'BaggingClassifier',
        'GradientBoostingClassifier',
        'AdaBoostClassifier',
        'DecisionTreeClassifier',
        'ExtraTreesClassifier',
        'RandomForestClassifier',
    )
}

# First search: pick the best candidates by ROC AUC, using the first 10% of the rows.
max_train_row = int(len(income)*0.1)
print(f"Number of samples to train on: {max_train_row}\n")
model_dicts = select_model(income[:max_train_row], target, models_to_fit, refit_metric = 'ROC_AUC')
print("model selection finished")

# Second search on the same rows: pick the best candidates by recall on class 1.
# model_dicts is overwritten, so only the printed report of the first search remains.
max_train_row = int(len(income)*0.1)
print(f"Number of samples to train on: {max_train_row}\n")
model_dicts = select_model(income[:max_train_row], target, models_to_fit, refit_metric = 'recall_1')
print("model selection finished")
# Estimators configured with fixed, previously chosen hyper-parameters.
# model_0 .. model_3 come first; model_4 and model_5 are the recall_1-optimized ones.
best_model = dict(learning_rate=0.05, max_features='log2', n_estimators=100, random_state=1,
                  subsample=1.0)
model_0 = GradientBoostingClassifier(**best_model)

best_model = dict(class_weight=None, criterion='entropy', max_depth=10, max_features='sqrt',
                  min_samples_leaf=19, min_samples_split=2)
model_1 = DecisionTreeClassifier(**best_model)

best_model = dict(class_weight=None, criterion='entropy', max_depth=10,
                  max_features='log2', min_samples_leaf=8, min_samples_split=2,
                  n_estimators=100, n_jobs=-1)
model_2 = ExtraTreesClassifier(**best_model)

best_model = dict(class_weight=None, criterion='gini', max_depth=5, max_features='log2',
                  min_samples_leaf=8, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=1)
model_3 = RandomForestClassifier(**best_model)

# Models optimized for recall_1
best_model = dict(class_weight='balanced', criterion='gini', max_depth=2, max_features='sqrt',
                  min_samples_leaf=4, min_samples_split=2)
model_4 = DecisionTreeClassifier(**best_model)

best_model = dict(class_weight={0: 1, 1: 5}, criterion='gini', max_depth=2, max_features='log2',
                  min_samples_leaf=1, min_samples_split=3, n_estimators=5, n_jobs=-1)
model_5 = RandomForestClassifier(**best_model)

print("models initialized")
def _report_cv_performance(model, df, columns, target_column, cv):
    """Print the cross-validated classification report, confusion matrix and ROC AUC of ``model``.

    Args:
        model: Unfitted estimator to evaluate.
        df: DataFrame with the feature columns and the target column.
        columns: Names of the feature columns to train on.
        target_column: Name of the target column in ``df``.
        cv: Cross-validation splitter passed to ``cross_val_predict``.
    """
    # Out-of-fold class labels feed the report and the confusion matrix.
    predictions = pd.Series(cross_val_predict(model, df[columns], df[target_column], cv=cv))
    # Out-of-fold probabilities feed ROC AUC (column 1 is used as the score).
    pred_proba = cross_val_predict(model, df[columns], df[target_column], cv=cv, method='predict_proba')

    print(f"Results for {type(model).__name__}:\n")
    print(classification_report(df[target_column], predictions))
    cm = confusion_matrix(df[target_column], predictions)
    myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
    print(f"\nROC_AUC: {roc_auc_score(df[target_column], pred_proba[:,1]):0.3f}\n")
    print('------------------------------------------------------\n\n')


kf = KFold(10, shuffle=True, random_state=1)

# The feature list produced by select_features is named opt_cols_RFC;
# the previously referenced optimized_columns_RFC is never defined.
# NOTE(review): model_2 (ExtraTreesClassifier) is not evaluated here -- confirm this is intentional.
models = [model_0, model_1, model_3]
for model in models:
    _report_cv_performance(model, income, opt_cols_RFC, target, kf)

# Models optimized for recall_1.
models = [model_4, model_5]
for model in models:
    _report_cv_performance(model, income, opt_cols_RFC, target, kf)