from helpers.clean import *
from helpers.eda import *
from helpers.model import *
from helpers.evaluate import * 

import time
start_notebook = time.time() 

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import ADASYN, \
                                   SMOTE

from sklearn.ensemble import AdaBoostClassifier, \
                             RandomForestClassifier, \
                             VotingClassifier

from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, \
                         plot_tree

from sklearn.metrics import accuracy_score, \
                            classification_report, \
                            confusion_matrix, \
                            ConfusionMatrixDisplay, \
                            f1_score, \
                            make_scorer, \
                            recall_score, \
                            roc_curve, \
                            precision_score

from sklearn.model_selection import cross_val_score, \
                                    train_test_split, \
                                    GridSearchCV    

seed = 42

teal = '#01ACA5'
purple = '#BF40BF'
sns.set_palette(sns.color_palette([teal, purple]))
white_median = dict(color='yellow')

pd.set_option('display.max_columns', 9)

import warnings
warnings.filterwarnings('ignore')


header = ['age', 'class_of_worker', 'industry_code', 'occupation_code', 'education', 'wage_per_hour',
          'enrolled_in_edu_inst_last_wk', 'marital_status', 'major_industry_code',
          'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union',
          'reason_for_unemployment', 'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses',
          'divdends_from_stocks', 'tax_filer_status', 'region_of_previous_residence',
          'state_of_previous_residence', 'detailed_household_and_family_stat',
          'detailed_household_summary_in_household', 'unknown_column', 'migration_code_change_in_msa',
          'migration_code_change_in_reg', 'migration_code_move_within_reg', 'live_in_this_house_1_year_ago',
          'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18',
          'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self',
          'citizenship', 'own_business_or_self_employed', 'fill_inc_questionnaire_for_veterans_admin',
          'veterans_benefits', 'weeks_worked_in_year', 'year', 'taxable_income_amount']

columns_to_drop_eda = ['weeks_worked_in_year',
                       'industry_code',
                       'hispanic_origin',
                       'detailed_household_and_family_stat',
                       'country_of_birth_father',
                       'country_of_birth_mother',
                       'country_of_birth_self',
                       'own_business_or_self_employed',
                       'veterans_benefits',
                       'year']

start_time = time.time()

raw_train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
raw_test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')

train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')

end_time = time.time()
execution_time = end_time - start_time
print(f'Loading time: {round(execution_time)} seconds')

Loading time: 9 seconds


start_time = time.time()

train = clean_df(train, *columns_to_drop_eda)
test = clean_df(test, *columns_to_drop_eda)

end_time = time.time()
execution_time = end_time - start_time
print(f'Cleaning time: {round(execution_time)} seconds')

Cleaning time: 8 seconds


get_cleaning_metrics(raw_train, train, raw_test, test, type_of_test_set='Test')

79785 observations removed from the train set.
39607 observations removed from the Test set.
30 columns removed from the train set.
30 columns removed from the Test set.

Train dataset observations reduced by 40.0%.
Test dataset observations reduced by 39.7%.
Train dataset columns reduced by 71.4%.
Test dataset columns reduced by 71.4%.

The training set has 119738 observations and  12 columns.
The training set has 60155 observations and  12 columns.


fig = plot_imbalance(train)
plt.show()


start_time = time.time()


train_x, train_y =  train.drop('taxable_income_amount', axis=1), train.taxable_income_amount
test_x, test_y = test.drop('taxable_income_amount', axis=1), test.taxable_income_amount


dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, test_x, test_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9013
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     54217
           1       0.00      0.00      0.00      5938

    accuracy                           0.90     60155
   macro avg       0.45      0.50      0.47     60155
weighted avg       0.81      0.90      0.85     60155

[[54217     0]
 [ 5938     0]]


Execution time: 0 seconds


title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=test_y, predicted_y=predicted_y)


train_x_dummies = pd.get_dummies(train_x)
test_x_dummies = pd.get_dummies(test_x)


train_scaler = StandardScaler().fit(train_x_dummies)
train_x_scaled = train_scaler.transform(train_x_dummies)

test_scaler = StandardScaler().fit(test_x_dummies)
test_x_scaled = test_scaler.transform(test_x_dummies)


train_x_split, valid_x, train_y_split, valid_y = train_test_split(train_x_scaled,
                                                                  train_y,
                                                                  test_size=0.33,
                                                                  random_state=42)

train_x = train_x_split
train_y = train_y_split
test_x = test_x_scaled


fig = plot_imbalance(test)
plt.show()


test.taxable_income_amount.value_counts(normalize=True)

0    0.901288
1    0.098712
Name: taxable_income_amount, dtype: float64


model = SMOTE()
smote_x, smote_y = model.fit_resample(train_x, train_y)
smote_y = smote_y.astype('int')


adasyn = ADASYN(random_state=42)
adasyn_x, adasyn_y = adasyn.fit_resample(train_x, train_y)


dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, train_x, train_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9004
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     72230
           1       0.00      0.00      0.00      7994

    accuracy                           0.90     80224
   macro avg       0.45      0.50      0.47     80224
weighted avg       0.81      0.90      0.85     80224

[[72230     0]
 [ 7994     0]]


Execution time: 0 seconds


title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=train_y, predicted_y=predicted_y)


end_time = time.time()
execution_time = end_time - start_time
print(f'Preprocessing time: {round(execution_time)} seconds')

Preprocessing time: 10 seconds


train_start = time.time()


models = {}


dr = DummyClassifier(strategy='most_frequent')
models['Dummy classifier'] = dr


lr = LogisticRegression(random_state=seed) #class_weight={0:1, 1:15}, 
models['Logistic regression'] = lr


nn = MLPClassifier(max_iter=300)
models['Neural net'] = nn


dt = DecisionTreeClassifier(random_state=seed) #max_depth=5, class_weight={0: 1, 1: 12}
models['Decision tree'] = dt


rf = RandomForestClassifier(random_state=seed) #class_weight={0: 1, 1: 12},
                                               #criterion='gini',
                                               #max_depth=13,
                                               #max_features='log2',
                                               #min_samples_leaf=10,
                                               #n_estimators=26,
                                               #n_jobs=-1,
models['Random forest'] = rf


gnb = GaussianNB()
models['Naive Bayes'] = gnb


knn = KNeighborsClassifier(n_neighbors=1, weights='distance')
models['K-nearest neighbors'] = knn


svm = SVC(C=0.1, kernel='poly', random_state=seed)
models['Support vector machine'] = svm


qda = QuadraticDiscriminantAnalysis()
models['Quadratic Discriminant Analysis'] = qda


abc = AdaBoostClassifier(random_state=seed) #n_estimators=100, 
models['Adaboost'] = qda


print('Models originally considered:')
for key in models.keys():
    print('-', key)

Models originally considered:
- Dummy classifier
- Logistic regression
- Neural net
- Decision tree
- Random forest
- Naive Bayes
- K-nearest neighbors
- Support vector machine
- Quadratic Discriminant Analysis
- Adaboost


# Combine the classifiers in the ensemble model
ensemble_model = VotingClassifier(estimators=[('lr', lr),
                                              ('nn', nn),
                                              ('dt', dt),
                                              ('rf', rf),
                                              ('knn', knn),
                                              ('svm', svm),
                                              ('qda', qda),
                                              ('abc', abc)],
                                              voting='hard')


modeling_start = time.time()


metric_columns = columns=['model', 'time_to_train', 'time_to_predict', 'total_time', 'accuracy', 'f1', 'recall']
metrics_train = metrics_valid = metrics_test = pd.DataFrame(columns=metric_columns)


train_start_time = time.time()


for model_name, model in models.items():
    metrics_train = train_and_test_model(metrics_df=metrics_train,
                                         train_x=train_x,
                                         train_y=train_y,
                                         test_x=train_x,
                                         test_y=train_y,
                                         model=model,
                                         model_name=model_name)


DUMMY CLASSIFIER

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9004
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     72230
           1       0.00      0.00      0.00      7994

    accuracy                           0.90     80224
   macro avg       0.45      0.50      0.47     80224
weighted avg       0.81      0.90      0.85     80224

[[72230     0]
 [ 7994     0]]


LOGISTIC REGRESSION

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.9114
F-score: 0.3774
Recall: 0.2695

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     72230
           1       0.63      0.27      0.38      7994

    accuracy                           0.91     80224
   macro avg       0.78      0.63      0.66     80224
weighted avg       0.89      0.91      0.90     80224

[[70962  1268]
 [ 5840  2154]]


NEURAL NET

Time to train: 38 seconds
Time to predict: 0 seconds
Total time: 38 seconds
Accuracy:) 0.9213
F-score: 0.4633
Recall: 0.3409

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     72230
           1       0.72      0.34      0.46      7994

    accuracy                           0.92     80224
   macro avg       0.83      0.66      0.71     80224
weighted avg       0.91      0.92      0.91     80224

[[71186  1044]
 [ 5269  2725]]


DECISION TREE

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9866
F-score: 0.9289
Recall: 0.88

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     72230
           1       0.98      0.88      0.93      7994

    accuracy                           0.99     80224
   macro avg       0.99      0.94      0.96     80224
weighted avg       0.99      0.99      0.99     80224

[[72112   118]
 [  959  7035]]


RANDOM FOREST

Time to train: 4 seconds
Time to predict: 1 seconds
Total time: 5 seconds
Accuracy:) 0.9865
F-score: 0.9303
Recall: 0.9049

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     72230
           1       0.96      0.90      0.93      7994

    accuracy                           0.99     80224
   macro avg       0.97      0.95      0.96     80224
weighted avg       0.99      0.99      0.99     80224

[[71906   324]
 [  760  7234]]


NAIVE BAYES

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.4603
F-score: 0.2554
Recall: 0.9288

              precision    recall  f1-score   support

           0       0.98      0.41      0.58     72230
           1       0.15      0.93      0.26      7994

    accuracy                           0.46     80224
   macro avg       0.56      0.67      0.42     80224
weighted avg       0.90      0.46      0.54     80224

[[29501 42729]
 [  569  7425]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 49 seconds
Total time: 49 seconds
Accuracy:) 0.9926
F-score: 0.9628
Recall: 0.9655

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     72230
           1       0.96      0.97      0.96      7994

    accuracy                           0.99     80224
   macro avg       0.98      0.98      0.98     80224
weighted avg       0.99      0.99      0.99     80224

[[71910   320]
 [  276  7718]]


SUPPORT VECTOR MACHINE

Time to train: 213 seconds
Time to predict: 29 seconds
Total time: 242 seconds
Accuracy:) 0.9092
F-score: 0.2378
Recall: 0.1422

              precision    recall  f1-score   support

           0       0.91      0.99      0.95     72230
           1       0.73      0.14      0.24      7994

    accuracy                           0.91     80224
   macro avg       0.82      0.57      0.59     80224
weighted avg       0.89      0.91      0.88     80224

[[71799   431]
 [ 6857  1137]]


QUADRATIC DISCRIMINANT ANALYSIS

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.3796
F-score: 0.2324
Recall: 0.9426

              precision    recall  f1-score   support

           0       0.98      0.32      0.48     72230
           1       0.13      0.94      0.23      7994

    accuracy                           0.38     80224
   macro avg       0.56      0.63      0.36     80224
weighted avg       0.90      0.38      0.45     80224

[[22922 49308]
 [  459  7535]]


ADABOOST

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.3796
F-score: 0.2324
Recall: 0.9426

              precision    recall  f1-score   support

           0       0.98      0.32      0.48     72230
           1       0.13      0.94      0.23      7994

    accuracy                           0.38     80224
   macro avg       0.56      0.63      0.36     80224
weighted avg       0.90      0.38      0.45     80224

[[22922 49308]
 [  459  7535]]


train_end_time = time.time()
train_execution_time = train_end_time - train_start_time
print(f'Train set execution time: {round(train_execution_time / 60)} minutes')

Train set execution time: 6 minutes


metrics_train.sort_values(by=['f1', 'time_to_predict'], ascending=False)


poor_models = metrics_train[metrics_train.f1 < 0.50].model


for model in poor_models:
    if model in models.keys():
        del models[model]
        
print('Models finally considered for training:')
for key in models.keys():
    print('-', key)

Models finally considered for training:
- Decision tree
- Random forest
- K-nearest neighbors


valid_start_time = time.time()


for model_name, model in models.items():
    metrics_valid = train_and_test_model(metrics_df=metrics_valid,
                                         train_x=train_x,
                                         train_y=train_y,
                                         test_x=valid_x,
                                         test_y=valid_y,
                                         model=model,
                                         model_name=model_name)


DECISION TREE

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.8769
F-score: 0.3845
Recall: 0.3871

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     35590
           1       0.38      0.39      0.38      3924

    accuracy                           0.88     39514
   macro avg       0.66      0.66      0.66     39514
weighted avg       0.88      0.88      0.88     39514

[[33131  2459]
 [ 2405  1519]]


RANDOM FOREST

Time to train: 4 seconds
Time to predict: 1 seconds
Total time: 5 seconds
Accuracy:) 0.9058
F-score: 0.4261
Recall: 0.3522

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     35590
           1       0.54      0.35      0.43      3924

    accuracy                           0.91     39514
   macro avg       0.74      0.66      0.69     39514
weighted avg       0.89      0.91      0.90     39514

[[34409  1181]
 [ 2542  1382]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 24 seconds
Total time: 24 seconds
Accuracy:) 0.8737
F-score: 0.3794
Recall: 0.3886

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     35590
           1       0.37      0.39      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.65      0.66      0.65     39514
weighted avg       0.88      0.87      0.88     39514

[[32999  2591]
 [ 2399  1525]]


valid_end_time = time.time()
valid_execution_time = valid_end_time - valid_start_time
print(f'Validation set execution time: {round(valid_execution_time / 60)} minutes')

Validation set execution time: 1 minutes


metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)


# fitting the model
model = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
model.fit(train_x, train_y)

# plotting feature importances
features = train_x_dummies.columns
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(10,15))
plt.title('Feature importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative importance')
plt.show()


smote_metrics_valid = pd.DataFrame(columns=metric_columns)


for model_name, model in models.items():
    smote_metrics_valid = train_and_test_model(metrics_df=smote_metrics_valid,
                                               train_x=smote_x,
                                               train_y=smote_y,
                                               test_x=valid_x,
                                               test_y=valid_y,
                                               model=model,
                                               model_name=model_name)


DECISION TREE

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.8679
F-score: 0.3763
Recall: 0.4011

              precision    recall  f1-score   support

           0       0.93      0.92      0.93     35590
           1       0.35      0.40      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.64      0.66      0.65     39514
weighted avg       0.88      0.87      0.87     39514

[[32722  2868]
 [ 2350  1574]]


RANDOM FOREST

Time to train: 9 seconds
Time to predict: 1 seconds
Total time: 10 seconds
Accuracy:) 0.8837
F-score: 0.4455
Recall: 0.4704

              precision    recall  f1-score   support

           0       0.94      0.93      0.94     35590
           1       0.42      0.47      0.45      3924

    accuracy                           0.88     39514
   macro avg       0.68      0.70      0.69     39514
weighted avg       0.89      0.88      0.89     39514

[[33073  2517]
 [ 2078  1846]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 43 seconds
Total time: 43 seconds
Accuracy:) 0.8489
F-score: 0.3869
Recall: 0.4801

              precision    recall  f1-score   support

           0       0.94      0.89      0.91     35590
           1       0.32      0.48      0.39      3924

    accuracy                           0.85     39514
   macro avg       0.63      0.68      0.65     39514
weighted avg       0.88      0.85      0.86     39514

[[31658  3932]
 [ 2040  1884]]


smote_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)


adasyn_metrics_valid = pd.DataFrame(columns=metric_columns)


for model_name, model in models.items():
    adasyn_metrics_valid = train_and_test_model(metrics_df=adasyn_metrics_valid,
                                                 train_x=adasyn_x,
                                                 train_y=adasyn_y,
                                                 test_x=valid_x,
                                                 test_y=valid_y,
                                                 model=model,
                                                 model_name=model_name)


DECISION TREE

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.8698
F-score: 0.3778
Recall: 0.3981

              precision    recall  f1-score   support

           0       0.93      0.92      0.93     35590
           1       0.36      0.40      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.65      0.66      0.65     39514
weighted avg       0.88      0.87      0.87     39514

[[32807  2783]
 [ 2362  1562]]


RANDOM FOREST

Time to train: 10 seconds
Time to predict: 1 seconds
Total time: 11 seconds
Accuracy:) 0.8828
F-score: 0.4471
Recall: 0.4771

              precision    recall  f1-score   support

           0       0.94      0.93      0.93     35590
           1       0.42      0.48      0.45      3924

    accuracy                           0.88     39514
   macro avg       0.68      0.70      0.69     39514
weighted avg       0.89      0.88      0.89     39514

[[33012  2578]
 [ 2052  1872]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 44 seconds
Total time: 44 seconds
Accuracy:) 0.8455
F-score: 0.3852
Recall: 0.4875

              precision    recall  f1-score   support

           0       0.94      0.88      0.91     35590
           1       0.32      0.49      0.39      3924

    accuracy                           0.85     39514
   macro avg       0.63      0.69      0.65     39514
weighted avg       0.88      0.85      0.86     39514

[[31495  4095]
 [ 2011  1913]]


fig = plt.figure(figsize=(15, 10))
plot_tree(dt, 
          feature_names=train_x_dummies.columns,
          class_names=['0', '1'], 
          filled=True, impurity=True, 
          rounded=True,
          max_depth=5
         )
plt.show()


adasyn_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)


tuning_start = time.time()


rf = RandomForestClassifier()

param_grid = {'class_weight': ['balanced'],
              'n_estimators': [10,100,500,700],
              'max_features': ['auto','log2'],
              'max_depth' : [10, 100],
              'min_samples_leaf': [0.001, 0.01, 0.05],
              'criterion': ['gini', 'entropy']
             }

#scorer = {'F1': make_scorer(f1_score)}

grid_rf = GridSearchCV(rf, param_grid=param_grid, scoring='f1', cv=5, verbose=1)
grid_rf.fit(train_x, train_y)

print(grid_rf.best_params_)

prediction = grid_rf.predict(valid_x)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 0.001, 'n_estimators': 500}


grid_rf.best_score_

0.44402874570737405


tuning_end = time.time()
tuning_execution_time = tuning_end - tuning_start
print()


ConfusionMatrixDisplay.from_predictions(valid_y, prediction, cmap='winter')
plt.title('Confusion Matrix - Random Forest with SMOTE (testing data)')
#plt.savefig(fname='011.png', format='png', dpi=400)
plt.show()


rf = RandomForestClassifier(class_weight='balanced',
                            criterion='gini',
                            max_depth=100, max_features='auto',
                            min_samples_leaf=0.001,
                            n_estimators=300)


final_metrics = pd.DataFrame(columns=metric_columns)


final_metrics = train_and_test_model(metrics_df=final_metrics,
                                     train_x=train_x,
                                     train_y=train_y,
                                     test_x=test_x,
                                     test_y=test_y,
                                     model=rf,
                                     model_name='Random Forest')


RANDOM FOREST

Time to train: 6 seconds
Time to predict: 1 seconds
Total time: 7 seconds
Accuracy:) 0.7968
F-score: 0.4424
Recall: 0.8166

              precision    recall  f1-score   support

           0       0.98      0.79      0.88     54217
           1       0.30      0.82      0.44      5938

    accuracy                           0.80     60155
   macro avg       0.64      0.81      0.66     60155
weighted avg       0.91      0.80      0.83     60155

[[43083 11134]
 [ 1089  4849]]


end_notebook = time.time()
print(f'Analysis took {round((end_notebook - start_notebook) / 60)} minutes to run.')

Analysis took 38 minutes to run.

model	time_to_train	time_to_predict	total_time	accuracy	f1	recall
K-nearest neighbors	0	49	49	0.992571	0.962824	0.965474
Random forest	4	1	5	0.986488	0.930298	0.904929
Decision tree	0	0	0	0.986575	0.928897	0.880035
Neural net	38	0	38	0.921308	0.463317	0.340881
Logistic regression	1	0	1	0.911398	0.377365	0.269452
Naive Bayes	0	0	0	0.460286	0.255383	0.928822
Support vector machine	213	29	242	0.909154	0.237816	0.142232
Quadratic Discriminant Analysis	0	0	0	0.379649	0.232429	0.942582
Adaboost	0	0	0	0.379649	0.232429	0.942582
Dummy classifier	0	0	0	0.900354	0.0	0.0

Set up¶

Cleaning¶

Preprocessing¶

Dummy regressor¶

Get dummies¶

Scale¶

Split¶

SMOTE¶

ADASYN¶

Dummy regressor¶

Modeling¶

Models¶

Dummy classifier¶

Logistic regression¶

Neural net¶

Decision Tree¶

Random forest¶

Naive Bayes¶

K-nearest neighbors¶

Support vector machine¶

Quadratic discriminant analysis¶

Adaboost¶

Pipeline¶

Evaluating model performance¶

Train¶

Validate¶

Try to deal with imbalance: SMOTE¶

Try to deal with imbalance: ADASYN¶

Hyperparameter tuning¶

Test¶

Conclusions¶

End¶

model	time_to_train	time_to_predict	total_time	accuracy	f1	recall
Random forest	4	1	5	0.90578	0.426083	0.352192
Decision tree	0	0	0	0.876904	0.38446	0.387105
K-nearest neighbors	0	24	24	0.873716	0.379353	0.388634

model	time_to_train	time_to_predict	total_time	accuracy	f1	recall
Random forest	9	1	10	0.883712	0.445517	0.470438
K-nearest neighbors	0	43	43	0.848864	0.386858	0.480122
Decision tree	1	0	1	0.867946	0.376285	0.401121

model	time_to_train	time_to_predict	total_time	accuracy	f1	recall
Random forest	10	1	11	0.882826	0.447098	0.477064
K-nearest neighbors	0	44	44	0.845472	0.385219	0.487513
Decision tree	1	0	1	0.869793	0.377797	0.398063