import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
import myML_functions as myML_functions
# Global Matplotlib defaults shared by all figures: extra title padding and larger fonts.
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
# Shared palette: two normalized RGB tuples followed by named/hex colors.
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
# Number of shades sampled from the colormap for multi-series plots.
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
print("Defaults Loaded")
# Load the automobile dataset; 'origin' is the classification target
# (1 = USA, 2 = Europe, 3 = Asia per the labels used further below).
cars = pd.read_csv("./data/auto.csv")
#find unique values
unique_regions = cars['origin'].unique()
# Class distribution of the target.
print(cars['origin'].value_counts())
# Dummy coding: convert the categoric columns into indicator (dummy) variables.
# Categoric columns: cylinders, year, origin.
# Encode cylinders and year as indicator columns, each with a readable prefix,
# and append them to the frame before dropping the original columns.
for column, prefix in (("cylinders", "cyl"), ("year", "year")):
    indicators = pd.get_dummies(cars[column], prefix=prefix)
    cars = pd.concat([cars, indicators], axis=1)
cars.drop(['cylinders', 'year'], axis=1, inplace=True)
display(cars.iloc[:3, :15])
# Shuffle the rows and split them into train (70%) and test (30%) sets.
# Reproducible shuffle of the rows, then a 70/30 train/test split.
np.random.seed(1)
order = np.random.permutation(len(cars))
shuffled_cars = cars.iloc[order]
split_at = int(0.7 * len(shuffled_cars))
train = shuffled_cars[:split_at]
test = shuffled_cars[split_at:]

# Keep the full shuffled frame and the target series around for later
# feature selection and cross-validation.
df_clean = shuffled_cars
target = 'origin'
target_df = df_clean['origin']
print("done")
# One-versus-all method for multiclass classification: choose one category as
# the positive case and group all the others into the negative case; then, for
# each observation, pick the label whose model assigns the highest probability.
# Train one binary logistic-regression classifier per origin class.
unique_origins = cars["origin"].unique()
unique_origins.sort()

models = {}
# Train only with the year and cylinder dummy columns.
cols = train.columns
cols_to_keep = (cols.str.contains('cyl') | cols.str.contains('year'))
testing_probs = pd.DataFrame(columns=unique_origins)

# Fit on the training set: for each class, the positive examples are the
# rows with that origin and every other row is a negative example.
X = train[cols[cols_to_keep]]
for label in unique_origins:
    y = train['origin'] == label
    classifier = LogisticRegression(solver='lbfgs')
    classifier.fit(X, y)
    models[label] = classifier
print("models fitted")
#predict
# Score every test row with each per-class model; column 1 of predict_proba
# is the probability of the positive class.
test_features = test[cols[cols_to_keep]]
for label in unique_origins:
    testing_probs[label] = models[label].predict_proba(test_features)[:, 1]
display(testing_probs[:5])

# The predicted origin is the class with the highest probability.
predicted_origins = testing_probs.idxmax(axis=1)
# NOTE(review): testing_probs has a fresh 0..n RangeIndex while `cars` keeps
# its original index, so this assignment aligns by index label, not by test-row
# position; the dropna below then removes every row that got no prediction.
# Verify this alignment is intended before trusting the downstream metrics.
cars['predicted_label'] = predicted_origins
cars.dropna(axis=0, inplace=True, how='any')
cars['predicted_label'] = cars['predicted_label'].astype('int')
display(cars['predicted_label'].iloc[:5])
# Accuracy
# Overall accuracy: the fraction of rows whose prediction matches the truth.
cars = cars.rename(columns={'origin': 'actual_label'})
is_correct = cars['actual_label'] == cars['predicted_label']
correct_predictions = cars[is_correct]
accuracy = len(correct_predictions) / len(cars)
print("Accuracy = {:0.3f}".format(accuracy))
# Recall / Sensitivity (True Positive Rate): $TPR = \frac{True\,Positives}{True\,Positives+False\,Negatives}$
# Aggregate true positives and false negatives over every class
# (micro-averaged counts across the one-vs-all problems).
class_labels = cars["actual_label"].unique()
true_positives = sum(
    ((cars["predicted_label"] == label) & (cars["actual_label"] == label)).sum()
    for label in class_labels
)
false_negatives = sum(
    ((cars["predicted_label"] != label) & (cars["actual_label"] == label)).sum()
    for label in class_labels
)
sensitivity = (true_positives)/(true_positives+false_negatives)
print("Sensitivity = {:0.3f}".format(sensitivity))
# Specificity (True Negative Rate): $TNR = \frac{True\,Negatives}{False\,Positives+True\,Negatives}$
# Aggregate true negatives and false positives over every class,
# mirroring the sensitivity computation above.
observed_classes = cars["actual_label"].unique()
true_negatives = sum(
    ((cars["predicted_label"] != value) & (cars["actual_label"] != value)).sum()
    for value in observed_classes
)
false_positives = sum(
    ((cars["predicted_label"] == value) & (cars["actual_label"] != value)).sum()
    for value in observed_classes
)
specificity = true_negatives/(true_negatives+false_positives)
print("Specificity = {:0.3f}".format(specificity))
# Matthews Correlation Coefficient: $MCC = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}$
# Matthews correlation coefficient from the aggregated confusion counts.
numerator = true_positives * true_negatives - false_positives * false_negatives
denominator = np.sqrt(
    (true_positives + false_positives) * (true_positives + false_negatives) *
    (true_negatives + false_positives) * (true_negatives + false_negatives)
)
MCC = numerator / denominator
print("MCC = {:0.3f}".format(MCC))

# Per-class precision/recall/F1 plus a labelled confusion matrix.
print(classification_report(cars['actual_label'],cars['predicted_label']))
cm = confusion_matrix(cars['actual_label'], cars['predicted_label'], labels=[1, 2, 3])
myML_functions.print_cm(cm, labels=['USA (1)', 'Europe (2)', ' Asia (3)'])
# Recursive feature elimination with cross-validation, run once per model
# family; each call returns the optimal feature subset for that estimator.
lr_estimator = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=3000, multi_class='ovr')
optimized_columns_LR = myML_functions.select_features_RFECV(df_clean, target, lr_estimator)

rf_estimator = RandomForestClassifier(n_estimators=50, random_state=1, min_samples_leaf=5, class_weight='balanced')
optimized_columns_RFC = myML_functions.select_features_RFECV(df_clean, target, rf_estimator)

gb_estimator = GradientBoostingClassifier(learning_rate=0.01, n_estimators=50, subsample=0.6, random_state=42)
optimized_columns_GBC = myML_functions.select_features_RFECV(df_clean, target, gb_estimator)
def select_model(df, target, models_to_fit):
    """Grid-search several classifier families and print their CV scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column.
    target : str
        Name of the target column in ``df``.
    models_to_fit : dict
        Maps a display name to a two-element list
        ``[estimator_name, feature_column_list]``; only families whose
        ``estimator_name`` matches a known grid are fitted.

    Returns
    -------
    list of dict
        One dict per model family; fitted families gain the keys
        'best_params', 'best_score' and 'best_estimator'.
    """
    import time  # local import: the module-level import appears later in the file

    dicts = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(max_iter=100000, multi_class='auto'),
            "hyperparameters": {
                "solver": ["lbfgs", "liblinear"],
                # FIX: "" is not a valid class_weight value; None means
                # "unweighted", which is what the empty string intended.
                "class_weight": ["balanced", None],
            },
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 20, 100],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5],
                "class_weight": [None, "balanced"],
            },
        },
        {
            "name": "GradientBoostingClassifier",
            "estimator": GradientBoostingClassifier(),
            "hyperparameters": {
                "n_estimators": [5, 2, 10],
                # FIX: "auto" was removed from max_features in sklearn 1.3;
                # None (use all features) keeps three options in the grid.
                "max_features": [None, "log2", "sqrt"],
                "learning_rate": [0.01, 0.05, 0.1, 0.5],
                "subsample": [0.1, 0.5, 1.0],
                "random_state": [1],
            },
        },
    ]

    # Multi-metric scoring; weighted averages cope with class imbalance.
    scoring = {'Accuracy': 'accuracy',
               'precision': make_scorer(precision_score, average='weighted'),
               'recall': make_scorer(recall_score, average='weighted'),
               'f1': make_scorer(f1_score, average='weighted')}

    all_y = df[target]
    for key, models_list in models_to_fit.items():
        print(key)
        print('-' * len(key))
        start = time.time()
        for element in dicts:
            if models_list[0] != element['name']:
                continue
            all_X = df[models_list[1]]
            # FIX: the deprecated `iid` argument was removed from GridSearchCV
            # in scikit-learn 0.24 and would raise TypeError on modern versions.
            grid = GridSearchCV(element['estimator'], element['hyperparameters'],
                                cv=10, scoring=scoring, refit='f1', n_jobs=1)
            grid.fit(all_X, all_y)
            element['best_params'] = grid.best_params_
            element['best_score'] = grid.best_score_
            element['best_estimator'] = grid.best_estimator_
            for scorer in scoring:
                print(f"{scorer}: {max(grid.cv_results_['mean_test_' + scorer]):0.3f}")
            print("Best Parameters: {}".format(grid.best_params_))
            print("Best Score: {:0.3f}\n".format(grid.best_score_))
        print(f"Time elapsed: {(time.time() - start)/60.:0.2f} mins\n\n")
    return dicts
# FIX: `warnings` was used below without ever being imported (NameError).
import warnings
import time

from sklearn.exceptions import UndefinedMetricWarning

# Silence UndefinedMetricWarning from CV folds where a class receives no
# predicted samples during the grid search.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Map each family name to [estimator name, RFECV-selected feature columns].
models_to_fit = {'LogisticRegression': ['LogisticRegression', optimized_columns_LR],
                 'RandomForestClassifier': ['RandomForestClassifier', optimized_columns_RFC],
                 'GradientBoostingClassifier': ['GradientBoostingClassifier', optimized_columns_GBC]}
# NOTE: the original sliced df_clean[:int(len(df_clean))], which is the whole
# frame — pass it directly.
model_dicts = select_model(df_clean, target, models_to_fit)
print("model selection finished")
# Final model: out-of-fold predictions for the best RandomForest configuration
# found by the grid search, evaluated over 10 shuffled folds.
kf = KFold(10, shuffle=True, random_state=1)
best_model = {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10,
              'max_features': 'log2', 'min_samples_leaf': 1,
              'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}
model = RandomForestClassifier(**best_model)

predictions = pd.Series(
    cross_val_predict(model, df_clean[optimized_columns_RFC], target_df, cv=kf)
)
print(classification_report(target_df, predictions))
cm = confusion_matrix(target_df, predictions, labels=[1, 2, 3])
myML_functions.print_cm(cm, labels=['USA (1)', 'Europe (2)', ' Asia (3)'])