Project: ML - All Ensemble Classifiers + GaussianNB & KNN (Predict Income from US census)¶

Problem:¶

Predict whether a person has a high income or not from US census data: age, education, sex, race, occupation, etc.
Binary classification using all the ensemble classifiers available in sklearn, plus GaussianNB and KNN.

Tools:¶

Feature Engineering: transform variables to categorical and reshuffle df, RFECV
Models: BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, GaussianNB, DecisionTreeClassifier, LogisticRegression, KNeighborsClassifier
Model validation and hyperparameter search: GridSearchCV
Error Metrics: ROC_AUC, precision, recall, f1, classification_report & confusion_matrix

load defaults¶

import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests 
import time

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math

from functions import *
import myML_functions as myML_functions

# Global matplotlib defaults applied to every figure created afterwards.
plt.rcParams.update({
    'axes.titlepad': 20,
    'font.size': 12,
    'axes.titlesize': 20,
})

# Palette: two RGB triples (0-255 channels scaled to 0-1), then named and hex colours.
colors = [
    tuple(channel/255 for channel in (0, 107, 164)),
    tuple(channel/255 for channel in (255, 128, 14)),
    'red', 'green', '#9E80BA', '#8EDB8E', '#58517A',
]

# Ncolors evenly spaced samples of the reversed Blues colormap between 0.2 and 0.5.
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
# Alternative palette: plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))


#specific to this project
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score

from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier



print("Defaults Loaded")

Defaults Loaded

Dataset: US census, predict high or low income¶

# index_col=False stops pandas from treating the first column (age) as the row index
income = pd.read_csv("./data/income.csv", index_col=False)

# Preview the first two rows: columns 0-11 plus the last column
preview_positions = list(range(12)) + [-1]
display(income.iloc[:2, preview_positions])

# Replace every text category by its integer category code
cols = ['workclass', 'education', 'marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country', 'high_income']
for column_name in cols:
    income[column_name] = pd.Categorical(income[column_name]).codes

# Remove columns without interesting information
income = income.drop(['fnlwgt', 'capital_gain', 'capital_loss'], axis=1)
display(income[:3])

# Columns to train with (all have been converted to numeric)
all_columns = [
    "age", "workclass", "education", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]
target = 'high_income'

# Shuffle the rows; the seed is fixed so the same order is produced on every run
np.random.seed(1)
shuffled_index = np.random.permutation(income.index)
income = income.reindex(shuffled_index)

1 - Decision Tree Classifier with holdout validation to check for overfitting¶

# Holdout validation: the first 80% of the rows train the model, the remaining 20% test it
split_row = int(len(income)*0.8)
train = income[:split_row]
test = income[split_row:]

# Unconstrained tree (random_state=1 so the result can be replicated)
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[all_columns], train["high_income"])

# NOTE(review): roc_auc_score is fed the hard class labels returned by predict(),
# not probabilities, so each AUC below is evaluated at a single operating point.
test_auc = roc_auc_score(test['high_income'], clf.predict(test[all_columns]))

predictions = clf.predict(train[all_columns])
train_auc = roc_auc_score(train['high_income'], predictions)

# A large gap between the two numbers hints at overfitting
print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))

AUC train:0.947, AUC test:0.694

The predictions on the train set are significantly better, which hints at overfitting. To avoid overfitting, restrict max_depth and min_samples_split.

# Constrained tree: cap the depth and require 13 samples before a node may be split
tree_limits = {'max_depth': 7, 'min_samples_split': 13}
clf = DecisionTreeClassifier(random_state=1, **tree_limits)
clf.fit(train[all_columns], train["high_income"])

# Class predictions on both partitions of the holdout split
predictions = clf.predict(test[all_columns])
train_predictions = clf.predict(train[all_columns])

test_auc = roc_auc_score(test["high_income"], predictions)
train_auc = roc_auc_score(train["high_income"], train_predictions)

print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))

AUC train:0.748, AUC test:0.744

2 - cross_val_predict Decision Tree Classifier & Random Forest Classifier¶

Decision Tree Classifier

# 10 shuffled folds; the seed is fixed so the split can be reproduced
kf = KFold(10, shuffle=True, random_state=1)

# max_features is 'sqrt' rather than 'auto': for classification trees both mean
# sqrt(n_features), but 'auto' is deprecated and rejected by newer scikit-learn releases.
best_model = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt',
              'min_samples_leaf': 5, 'min_samples_split': 13, 'random_state': 1}
model = DecisionTreeClassifier()
model.set_params(**best_model)

# Out-of-fold class predictions for every row of the data set
predictions = cross_val_predict(model, income[all_columns], income[target], cv=kf)
predictions = pd.Series(predictions)

print(classification_report(income[target], predictions))
cm = confusion_matrix(income[target], predictions)
myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
# NOTE(review): this AUC is computed from hard class labels, not from predicted probabilities
print(f"\nROC_AUC: {roc_auc_score(income[target], predictions):0.3f}")

              precision    recall  f1-score   support

           0       0.94      0.72      0.82     24720
           1       0.50      0.85      0.63      7841

   micro avg       0.76      0.76      0.76     32561
   macro avg       0.72      0.79      0.72     32561
weighted avg       0.83      0.76      0.77     32561

    true / pred  Low Income High Income 
     Low Income     17908.0      6812.0 
    High Income      1148.0      6693.0 

ROC_AUC: 0.789

Random Forest Classifier

# 10 shuffled folds; the seed is fixed so the split can be reproduced
kf = KFold(10, shuffle=True, random_state=1)

# max_features is 'sqrt' rather than 'auto': for forest classifiers both mean
# sqrt(n_features), but 'auto' is deprecated and rejected by newer scikit-learn releases.
best_model = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt',
              'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 150, 'n_jobs': -1,
              'random_state': 1}
model = RandomForestClassifier()
model.set_params(**best_model)

# Out-of-fold class predictions for every row of the data set
predictions = cross_val_predict(model, income[all_columns], income[target], cv=kf)
predictions = pd.Series(predictions)

print(classification_report(income[target], predictions))
cm = confusion_matrix(income[target], predictions)
myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
# NOTE(review): this AUC is computed from hard class labels, not from predicted probabilities
print(f"\nROC_AUC: {roc_auc_score(income[target], predictions):0.3f}")

              precision    recall  f1-score   support

           0       0.95      0.71      0.81     24720
           1       0.49      0.88      0.63      7841

   micro avg       0.75      0.75      0.75     32561
   macro avg       0.72      0.80      0.72     32561
weighted avg       0.84      0.75      0.77     32561

    true / pred  Low Income High Income 
     Low Income     17631.0      7089.0 
    High Income       939.0      6902.0 

ROC_AUC: 0.797

3 - RFECV for Decision Tree & Random Forest Classifier¶

def select_features(df, target, model):
    """Pick the feature columns kept by recursive feature elimination with 10-fold CV.

    Only the numeric columns of ``df`` that contain no NaN are considered.

    Args:
        df: DataFrame holding the candidate features and the target column.
        target: name of the target column in ``df``.
        model: estimator handed to RFECV to rank the features.

    Returns:
        List with the names of the columns RFECV marked as selected.
    """
    # Keep numeric columns only, then discard any column that has a NaN
    numeric_df = df.select_dtypes([np.number]).dropna(axis=1)

    all_y = numeric_df[target]
    all_X = numeric_df.drop(target, axis=1)

    # cv is the number of cross-validation folds
    selector = RFECV(model, cv=10)
    selector.fit(all_X, all_y)

    # support_ is a boolean mask over the columns of all_X
    optimized_columns = [name for name, kept in zip(all_X.columns, selector.support_) if kept]

    print(f"Best Columns, {type(model).__name__} model: {optimized_columns}\n")
    print('----------------------------------------------------\n')

    return optimized_columns

# Forest used to rank the features inside RFECV.
# max_features is 'sqrt' rather than 'auto': for forest classifiers both mean
# sqrt(n_features), but 'auto' is deprecated and rejected by newer scikit-learn releases.
model = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                               max_depth=5, max_features='sqrt', class_weight='balanced', n_jobs=-1)
opt_cols_RFC = select_features(income, target, model)

Best Columns, RandomForestClassifier model: ['age', 'education', 'education_num', 'marital_status', 'relationship', 'hours_per_week']

----------------------------------------------------

# Hand-picked feature list: every encoded census column except the target
selected_features_manual = [
    "age",
    "workclass",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

4 - Model Selection with GridSearchCV¶

def select_model(df, target, models_to_fit, refit_metric = 'ROC_AUC'):
    """Grid-search the requested classifiers and print a cross-validated score summary.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: name of the target column in ``df``.
        models_to_fit: dict mapping a label to ``[model_name, feature_columns]``.
            The label is printed as a heading, ``model_name`` selects an entry of
            the internal catalogue and ``feature_columns`` are the columns of
            ``df`` used as features.
        refit_metric: key of the scoring dict ('ROC_AUC', 'Accuracy', 'recall',
            'recall_1' or 'recall_0') used to rank the candidates and to refit
            the best one.

    Returns:
        The whole catalogue (list of dicts). Entries that were searched gain the
        keys 'best_params', 'best_score' and 'best_estimator'.
    """
    import inspect  # local import: only needed to probe the GridSearchCV signature

    # Fixed base learners combined by the VotingClassifier entry below.
    # max_features='sqrt' replaces 'auto': for classifiers both mean sqrt(n_features),
    # but 'auto' is deprecated and rejected by newer scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                                max_depth=5, max_features='sqrt', class_weight='balanced')
    gb = GradientBoostingClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, min_samples_split=2,
                                    max_depth=5, max_features='sqrt')
    ab = AdaBoostClassifier(n_estimators=150, random_state=1, learning_rate=0.5, algorithm='SAMME.R')

    # Catalogue of searchable models: display name, estimator and hyperparameter grid
    dicts = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(max_iter = 50000),
            "hyperparameters": {
                "solver": ["newton-cg", "lbfgs", "liblinear"],
                # None (no re-weighting) replaces the empty string, which is not a
                # valid class_weight value
                "class_weight": ["balanced", None],
            },
        },
        {
            "name": "GaussianNB",
            "estimator": GaussianNB(),
            "hyperparameters": {
                "var_smoothing": [1.e-8, 1.e-9, 1.e-10],
            },
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters": {
                "n_neighbors": range(1, 20, 2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1, 2],
            },
        },
        {
            "name": "BaggingClassifier",
            "estimator": BaggingClassifier(KNeighborsClassifier(algorithm='kd_tree', n_neighbors=13,
                                                                p=2, weights='uniform')),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "max_samples": [0.1, 0.5, 0.8],
                "max_features": [0.1, 0.5, 0.8],
                "bootstrap": [True, False],
                "warm_start": [True, False],
            },
        },
        {
            "name": "GradientBoostingClassifier",
            "estimator": GradientBoostingClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                # 'auto' was a duplicate of 'sqrt' for classifiers; 'sqrt' stays first
                # so candidate order (and therefore tie-breaking) is unchanged
                "max_features": ["sqrt", "log2"],
                "learning_rate": [0.01, 0.05, 0.1, 0.5],
                "subsample": [0.1, 0.5, 1.0],
                "random_state": [1],
            },
        },
        {
            "name": "AdaBoostClassifier",
            "estimator": AdaBoostClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "learning_rate": [0.01, 0.05, 0.1, 0.5],
                "algorithm": ['SAMME', 'SAMME.R'],
                "random_state": [1],
            },
        },
        {
            "name": "DecisionTreeClassifier",
            "estimator": DecisionTreeClassifier(),
            "hyperparameters": {
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": range(1, 20, 3),
                "min_samples_split": range(2, 20, 3),
                "class_weight": [None, "balanced"],
            },
        },
        {
            "name": "ExtraTreesClassifier",
            "estimator": ExtraTreesClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5],
                "class_weight": [None, "balanced", {0: 1, 1: 3}, {0: 1, 1: 5}],
            },
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5],
                "class_weight": [None, "balanced", {0: 1, 1: 3}, {0: 1, 1: 5}],
            },
        },
        {
            "name": "VotingClassifier",
            "estimator": VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('ab', ab)]),
            "hyperparameters": {
                "voting": ["soft"],
            },
        },
    ]

    # Every metric is evaluated for every candidate; refit_metric picks the winner
    scoring = {'ROC_AUC':'roc_auc', 'Accuracy':'accuracy',
               'recall': make_scorer(recall_score, average='weighted'),
               'recall_1': make_scorer(recall_score, pos_label=1),
               'recall_0': make_scorer(recall_score, pos_label=0)}

    # The iid keyword was removed from GridSearchCV in later scikit-learn releases;
    # pass iid=True only where it is still accepted so older versions behave as before.
    grid_options = {'cv': 10, 'scoring': scoring, 'refit': refit_metric, 'n_jobs': 1}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        grid_options['iid'] = True

    all_y = df[target]
    for key, models_list in models_to_fit.items():
        print(key)
        print('-'*len(key))
        start = time.time()
        for element in dicts:
            if models_list[0] != element['name']:
                continue

            all_X = df[models_list[1]]
            grid = GridSearchCV(element['estimator'], element['hyperparameters'], **grid_options)
            grid.fit(all_X, all_y)

            # Store the outcome on the catalogue entry that is returned to the caller
            element['best_params'] = grid.best_params_
            element['best_score'] = grid.best_score_
            element['best_estimator'] = grid.best_estimator_

            # Position of the winning candidate inside the cv_results_ arrays
            results = grid.cv_results_
            best_index = grid.best_index_

            # Best score for the metric used to choose the best model
            print(f"{refit_metric}:{max(results['mean_test_'+refit_metric]):0.3f} ", end="")
            print(f"(Metric used to choose best model)\n")

            # For every other metric: best value over all candidates | value of the chosen model
            print(f"{' ':10} best_value | value_best_model")
            for scorer in scoring:
                if scorer == refit_metric:
                    continue
                score_best_model = results['mean_test_%s' % scorer][best_index]
                print(f"{scorer+':':<15} {max(results['mean_test_'+scorer]):0.3f} | {score_best_model:0.3f}")
            print("")
            print(f"Best Parameters: {grid.best_params_}")
            print(f"Best Score: {grid.best_score_:0.3f}\n")

        print(f"Time elapsed: {(time.time()-start)/60.:0.2f} mins\n")
        print("="*110)
        print("\n")

    return dicts
 
# Every classifier is searched on the feature subset selected by RFECV (opt_cols_RFC).
# To search a single model, shorten this list, e.g. ['DecisionTreeClassifier'].
model_names = [
    'VotingClassifier',
    'LogisticRegression',
    'GaussianNB',
    'KNeighborsClassifier',
    'BaggingClassifier',
    'GradientBoostingClassifier',
    'AdaBoostClassifier',
    'DecisionTreeClassifier',
    'ExtraTreesClassifier',
    'RandomForestClassifier',
]
models_to_fit = {name: [name, opt_cols_RFC] for name in model_names}

# The grid search runs on the first 10% of the rows only
max_train_row = int(len(income)*0.1)
print(f"Number of samples to train on: {max_train_row}\n")

search_sample = income[:max_train_row]
model_dicts = select_model(search_sample, target, models_to_fit, refit_metric = 'ROC_AUC')

print("model selection finished")

Number of samples to train on: 3256

VotingClassifier
----------------
ROC_AUC:0.863 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.807 | 0.807
recall:         0.807 | 0.807
recall_1:       0.695 | 0.695
recall_0:       0.840 | 0.840

Best Parameters: {'voting': 'soft'}
Best Score: 0.863

Time elapsed: 0.29 mins

==============================================================================================================


LogisticRegression
------------------
ROC_AUC:0.814 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.808 | 0.727
recall:         0.808 | 0.727
recall_1:       0.765 | 0.764
recall_0:       0.946 | 0.717

Best Parameters: {'class_weight': 'balanced', 'solver': 'newton-cg'}
Best Score: 0.814

Time elapsed: 0.05 mins

==============================================================================================================


GaussianNB
----------
ROC_AUC:0.848 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.807 | 0.807
recall:         0.807 | 0.807
recall_1:       0.670 | 0.670
recall_0:       0.847 | 0.847

Best Parameters: {'var_smoothing': 1e-08}
Best Score: 0.848

Time elapsed: 0.02 mins

==============================================================================================================


KNeighborsClassifier
--------------------
ROC_AUC:0.838 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.817 | 0.811
recall:         0.817 | 0.811
recall_1:       0.491 | 0.455
recall_0:       0.919 | 0.915

Best Parameters: {'algorithm': 'brute', 'n_neighbors': 19, 'p': 1, 'weights': 'uniform'}
Best Score: 0.838

Time elapsed: 13.06 mins

==============================================================================================================


BaggingClassifier
-----------------
ROC_AUC:0.869 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.822 | 0.781
recall:         0.822 | 0.781
recall_1:       0.382 | 0.042
recall_0:       1.000 | 0.999

Best Parameters: {'bootstrap': False, 'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 100, 'warm_start': False}
Best Score: 0.869

Time elapsed: 45.20 mins

==============================================================================================================


GradientBoostingClassifier
--------------------------
ROC_AUC:0.869 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.826 | 0.823
recall:         0.826 | 0.823
recall_1:       0.509 | 0.438
recall_0:       1.000 | 0.936

Best Parameters: {'learning_rate': 0.05, 'max_features': 'log2', 'n_estimators': 100, 'random_state': 1, 'subsample': 1.0}
Best Score: 0.869

Time elapsed: 1.40 mins

==============================================================================================================


AdaBoostClassifier
------------------
ROC_AUC:0.868 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.822 | 0.822
recall:         0.822 | 0.822
recall_1:       0.445 | 0.357
recall_0:       1.000 | 0.959

Best Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 1}
Best Score: 0.868

Time elapsed: 0.57 mins

==============================================================================================================


DecisionTreeClassifier
----------------------
ROC_AUC:0.855 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.821 | 0.754
recall:         0.821 | 0.754
recall_1:       0.877 | 0.839
recall_0:       0.994 | 0.729

Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 19, 'min_samples_split': 2}
Best Score: 0.855

Time elapsed: 3.34 mins

==============================================================================================================


ExtraTreesClassifier
--------------------
ROC_AUC:0.868 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.826 | 0.743
recall:         0.826 | 0.743
recall_1:       0.969 | 0.863
recall_0:       1.000 | 0.708

Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 20}
Best Score: 0.868

Time elapsed: 27.07 mins

==============================================================================================================


RandomForestClassifier
----------------------
ROC_AUC:0.868 (Metric used to choose best model)

           best_value | value_best_model
Accuracy:       0.826 | 0.822
recall:         0.826 | 0.822
recall_1:       0.924 | 0.405
recall_0:       0.991 | 0.944

Best Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.868

Time elapsed: 30.04 mins

==============================================================================================================


model selection finished

refit = recall_1¶

# Repeat the grid search on the first 10% of the rows, this time ranking the
# candidates by the 'recall_1' scorer instead of ROC_AUC
max_train_row = int(len(income)*0.1)
print(f"Number of samples to train on: {max_train_row}\n")

search_sample = income[:max_train_row]
model_dicts = select_model(search_sample, target, models_to_fit, refit_metric = 'recall_1')

print("model selection finished")

Number of samples to train on: 3256

VotingClassifier
----------------
recall_1:0.695 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.863 | 0.863
Accuracy:       0.807 | 0.807
recall:         0.807 | 0.807
recall_0:       0.840 | 0.840

Best Parameters: {'voting': 'soft'}
Best Score: 0.695

Time elapsed: 0.24 mins

==============================================================================================================


LogisticRegression
------------------
recall_1:0.765 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.814 | 0.813
Accuracy:       0.808 | 0.725
recall:         0.808 | 0.725
recall_0:       0.946 | 0.713

Best Parameters: {'class_weight': 'balanced', 'solver': 'liblinear'}
Best Score: 0.765

Time elapsed: 0.04 mins

==============================================================================================================


GaussianNB
----------
recall_1:0.670 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.848 | 0.848
Accuracy:       0.807 | 0.807
recall:         0.807 | 0.807
recall_0:       0.847 | 0.847

Best Parameters: {'var_smoothing': 1e-08}
Best Score: 0.670

Time elapsed: 0.01 mins

==============================================================================================================


KNeighborsClassifier
--------------------
recall_1:0.491 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.838 | 0.669
Accuracy:       0.817 | 0.766
recall:         0.817 | 0.766
recall_0:       0.919 | 0.847

Best Parameters: {'algorithm': 'kd_tree', 'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
Best Score: 0.491

Time elapsed: 11.94 mins

==============================================================================================================


BaggingClassifier
-----------------
recall_1:0.391 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.869 | 0.849
Accuracy:       0.822 | 0.818
recall:         0.822 | 0.818
recall_0:       1.000 | 0.944

Best Parameters: {'bootstrap': True, 'max_features': 0.5, 'max_samples': 0.8, 'n_estimators': 5, 'warm_start': False}
Best Score: 0.391

Time elapsed: 47.59 mins

==============================================================================================================


GradientBoostingClassifier
--------------------------
recall_1:0.509 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.869 | 0.822
Accuracy:       0.826 | 0.799
recall:         0.826 | 0.799
recall_0:       1.000 | 0.884

Best Parameters: {'learning_rate': 0.5, 'max_features': 'log2', 'n_estimators': 20, 'random_state': 1, 'subsample': 0.1}
Best Score: 0.509

Time elapsed: 1.57 mins

==============================================================================================================


AdaBoostClassifier
------------------
recall_1:0.445 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.868 | 0.867
Accuracy:       0.822 | 0.821
recall:         0.822 | 0.821
recall_0:       1.000 | 0.932

Best Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.5, 'n_estimators': 100, 'random_state': 1}
Best Score: 0.445

Time elapsed: 0.58 mins

==============================================================================================================


DecisionTreeClassifier
----------------------
recall_1:0.907 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.855 | 0.769
Accuracy:       0.821 | 0.624
recall:         0.821 | 0.624
recall_0:       1.000 | 0.541

Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Score: 0.907

Time elapsed: 3.52 mins

==============================================================================================================


ExtraTreesClassifier
--------------------
recall_1:0.972 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.869 | 0.848
Accuracy:       0.827 | 0.589
recall:         0.827 | 0.589
recall_0:       1.000 | 0.477

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'criterion': 'entropy', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.972

Time elapsed: 33.96 mins

==============================================================================================================


RandomForestClassifier
----------------------
recall_1:0.923 (Metric used to choose best model)

           best_value | value_best_model
ROC_AUC:        0.869 | 0.840
Accuracy:       0.828 | 0.632
recall:         0.828 | 0.632
recall_0:       0.993 | 0.547

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 5}
Best Score: 0.923

Time elapsed: 34.61 mins

==============================================================================================================


model selection finished

5 - cross_val_predict for best models (GradientBoostingClassifier, DecisionTreeClassifier, ExtraTreesClassifier, RandomForestClassifier)¶

# NOTE(review): several values below differ from the "Best Parameters" printed by the
# grid searches above (e.g. class_weight / max_features of model_1) - confirm they come
# from the intended run.

# Models optimized for ROC_AUC
best_model = dict(learning_rate=0.05, max_features='log2', n_estimators=100, random_state=1,
                  subsample=1.0)
model_0 = GradientBoostingClassifier(**best_model)

best_model = dict(class_weight=None, criterion='entropy', max_depth=10, max_features='sqrt',
                  min_samples_leaf=19, min_samples_split=2)
model_1 = DecisionTreeClassifier(**best_model)

best_model = dict(class_weight=None, criterion='entropy', max_depth=10,
                  max_features='log2', min_samples_leaf=8, min_samples_split=2,
                  n_estimators=100, n_jobs=-1)
model_2 = ExtraTreesClassifier(**best_model)

best_model = dict(class_weight=None, criterion='gini', max_depth=5, max_features='log2',
                  min_samples_leaf=8, min_samples_split=2, n_estimators=100, n_jobs=-1,
                  random_state=1)
model_3 = RandomForestClassifier(**best_model)


# Models optimized for recall_1
best_model = dict(class_weight='balanced', criterion='gini', max_depth=2, max_features='sqrt',
                  min_samples_leaf=4, min_samples_split=2)
model_4 = DecisionTreeClassifier(**best_model)

best_model = dict(class_weight={0: 1, 1: 5}, criterion='gini', max_depth=2, max_features='log2',
                  min_samples_leaf=1, min_samples_split=3, n_estimators=5, n_jobs=-1)
model_5 = RandomForestClassifier(**best_model)

print("models initialized")

models initialized

best models for optimized ROC_AUC¶

# Models tuned for ROC_AUC: gradient boosting, decision tree and random forest
models = [model_0, model_1, model_3]
kf = KFold(10, shuffle=True, random_state=1)

# opt_cols_RFC holds the RFECV-selected columns returned by select_features;
# the features and labels are the same for every model, so they are built once.
cv_X = income[opt_cols_RFC]
cv_y = income[target]

for model in models:
    # Out-of-fold class labels for the report, out-of-fold probabilities for the AUC
    predictions = pd.Series(cross_val_predict(model, cv_X, cv_y, cv=kf))
    pred_proba = cross_val_predict(model, cv_X, cv_y, cv=kf, method='predict_proba')

    print(f"Results for {type(model).__name__}:\n")
    print(classification_report(cv_y, predictions))
    cm = confusion_matrix(cv_y, predictions)
    myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
    # Column 1 of pred_proba is the probability of class 1 (high income)
    print(f"\nROC_AUC: {roc_auc_score(cv_y, pred_proba[:,1]):0.3f}\n")
    print('------------------------------------------------------\n\n')

Results for GradientBoostingClassifier:

              precision    recall  f1-score   support

           0       0.85      0.94      0.89     24720
           1       0.71      0.49      0.58      7841

   micro avg       0.83      0.83      0.83     32561
   macro avg       0.78      0.71      0.74     32561
weighted avg       0.82      0.83      0.82     32561

    true / pred  Low Income High Income 
     Low Income     23176.0      1544.0 
    High Income      4001.0      3840.0 

ROC_AUC: 0.883

------------------------------------------------------


Results for DecisionTreeClassifier:

              precision    recall  f1-score   support

           0       0.85      0.92      0.88     24720
           1       0.66      0.49      0.56      7841

   micro avg       0.82      0.82      0.82     32561
   macro avg       0.76      0.71      0.72     32561
weighted avg       0.81      0.82      0.81     32561

    true / pred  Low Income High Income 
     Low Income     22727.0      1993.0 
    High Income      3974.0      3867.0 

ROC_AUC: 0.869

------------------------------------------------------


Results for RandomForestClassifier:

              precision    recall  f1-score   support

           0       0.84      0.95      0.89     24720
           1       0.74      0.41      0.52      7841

   micro avg       0.82      0.82      0.82     32561
   macro avg       0.79      0.68      0.71     32561
weighted avg       0.81      0.82      0.80     32561

    true / pred  Low Income High Income 
     Low Income     23580.0      1140.0 
    High Income      4658.0      3183.0 

ROC_AUC: 0.877

------------------------------------------------------

best models for optimized Recall_1¶

# Models tuned for recall on class 1: decision tree and random forest
models = [model_4, model_5]
kf = KFold(10, shuffle=True, random_state=1)

# opt_cols_RFC holds the RFECV-selected columns returned by select_features;
# the features and labels are the same for every model, so they are built once.
cv_X = income[opt_cols_RFC]
cv_y = income[target]

for model in models:
    # Out-of-fold class labels for the report, out-of-fold probabilities for the AUC
    predictions = pd.Series(cross_val_predict(model, cv_X, cv_y, cv=kf))
    pred_proba = cross_val_predict(model, cv_X, cv_y, cv=kf, method='predict_proba')

    print(f"Results for {type(model).__name__}:\n")
    print(classification_report(cv_y, predictions))
    cm = confusion_matrix(cv_y, predictions)
    myML_functions.print_cm(cm, labels=['Low Income', 'High Income'])
    # Column 1 of pred_proba is the probability of class 1 (high income)
    print(f"\nROC_AUC: {roc_auc_score(cv_y, pred_proba[:,1]):0.3f}\n")
    print('------------------------------------------------------\n\n')

Results for DecisionTreeClassifier:

              precision    recall  f1-score   support

           0       0.90      0.71      0.79     24720
           1       0.45      0.75      0.56      7841

   micro avg       0.72      0.72      0.72     32561
   macro avg       0.67      0.73      0.68     32561
weighted avg       0.79      0.72      0.74     32561

    true / pred  Low Income High Income 
     Low Income     17469.0      7251.0 
    High Income      1938.0      5903.0 

ROC_AUC: 0.793

------------------------------------------------------


Results for RandomForestClassifier:

              precision    recall  f1-score   support

           0       0.95      0.56      0.70     24720
           1       0.39      0.90      0.55      7841

   micro avg       0.64      0.64      0.64     32561
   macro avg       0.67      0.73      0.62     32561
weighted avg       0.81      0.64      0.66     32561

    true / pred  Low Income High Income 
     Low Income     13788.0     10932.0 
    High Income       764.0      7077.0 

ROC_AUC: 0.836

------------------------------------------------------

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	capital_loss	high_income
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	<=50K