Set up¶

In [1]:
from helpers.clean import *
from helpers.eda import *
from helpers.model import *
from helpers.evaluate import * 

import time
start_notebook = time.time() 

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import ADASYN, \
                                   SMOTE

from sklearn.ensemble import AdaBoostClassifier, \
                             RandomForestClassifier, \
                             VotingClassifier

from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, \
                         plot_tree

from sklearn.metrics import accuracy_score, \
                            classification_report, \
                            confusion_matrix, \
                            ConfusionMatrixDisplay, \
                            f1_score, \
                            make_scorer, \
                            recall_score, \
                            roc_curve, \
                            precision_score

from sklearn.model_selection import cross_val_score, \
                                    train_test_split, \
                                    GridSearchCV    

seed = 42

teal = '#01ACA5'
purple = '#BF40BF'
sns.set_palette(sns.color_palette([teal, purple]))
white_median = dict(color='yellow')

pd.set_option('display.max_columns', 9)

import warnings
warnings.filterwarnings('ignore')
In [2]:
header = ['age', 'class_of_worker', 'industry_code', 'occupation_code', 'education', 'wage_per_hour',
          'enrolled_in_edu_inst_last_wk', 'marital_status', 'major_industry_code',
          'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union',
          'reason_for_unemployment', 'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses',
          'divdends_from_stocks', 'tax_filer_status', 'region_of_previous_residence',
          'state_of_previous_residence', 'detailed_household_and_family_stat',
          'detailed_household_summary_in_household', 'unknown_column', 'migration_code_change_in_msa',
          'migration_code_change_in_reg', 'migration_code_move_within_reg', 'live_in_this_house_1_year_ago',
          'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18',
          'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self',
          'citizenship', 'own_business_or_self_employed', 'fill_inc_questionnaire_for_veterans_admin',
          'veterans_benefits', 'weeks_worked_in_year', 'year', 'taxable_income_amount']

columns_to_drop_eda = ['weeks_worked_in_year',
                       'industry_code',
                       'hispanic_origin',
                       'detailed_household_and_family_stat',
                       'country_of_birth_father',
                       'country_of_birth_mother',
                       'country_of_birth_self',
                       'own_business_or_self_employed',
                       'veterans_benefits',
                       'year']

start_time = time.time()

raw_train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
raw_test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')

train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')

end_time = time.time()
execution_time = end_time - start_time
print(f'Loading time: {round(execution_time)} seconds')
Loading time: 9 seconds

Cleaning¶

In [3]:
start_time = time.time()

train = clean_df(train, *columns_to_drop_eda)
test = clean_df(test, *columns_to_drop_eda)

end_time = time.time()
execution_time = end_time - start_time
print(f'Cleaning time: {round(execution_time)} seconds')
Cleaning time: 8 seconds
In [4]:
get_cleaning_metrics(raw_train, train, raw_test, test, type_of_test_set='Test')
79785 observations removed from the train set.
39607 observations removed from the Test set.
30 columns removed from the train set.
30 columns removed from the Test set.

Train dataset observations reduced by 40.0%.
Test dataset observations reduced by 39.7%.
Train dataset columns reduced by 71.4%.
Test dataset columns reduced by 71.4%.

The training set has 119738 observations and  12 columns.
The training set has 60155 observations and  12 columns.
In [5]:
fig = plot_imbalance(train)
plt.show()

Preprocessing¶

In [6]:
start_time = time.time()
In [7]:
train_x, train_y =  train.drop('taxable_income_amount', axis=1), train.taxable_income_amount
test_x, test_y = test.drop('taxable_income_amount', axis=1), test.taxable_income_amount

Dummy regressor¶

In [8]:
dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, test_x, test_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')
Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9013
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     54217
           1       0.00      0.00      0.00      5938

    accuracy                           0.90     60155
   macro avg       0.45      0.50      0.47     60155
weighted avg       0.81      0.90      0.85     60155

[[54217     0]
 [ 5938     0]]


Execution time: 0 seconds
In [9]:
title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=test_y, predicted_y=predicted_y)

Get dummies¶

In [10]:
train_x_dummies = pd.get_dummies(train_x)
test_x_dummies = pd.get_dummies(test_x)

Scale¶

In [11]:
train_scaler = StandardScaler().fit(train_x_dummies)
train_x_scaled = train_scaler.transform(train_x_dummies)

test_scaler = StandardScaler().fit(test_x_dummies)
test_x_scaled = test_scaler.transform(test_x_dummies)

Split¶

In [12]:
train_x_split, valid_x, train_y_split, valid_y = train_test_split(train_x_scaled,
                                                                  train_y,
                                                                  test_size=0.33,
                                                                  random_state=42)

train_x = train_x_split
train_y = train_y_split
test_x = test_x_scaled
In [13]:
fig = plot_imbalance(test)
plt.show()
In [14]:
test.taxable_income_amount.value_counts(normalize=True)
Out[14]:
0    0.901288
1    0.098712
Name: taxable_income_amount, dtype: float64

SMOTE¶

In [15]:
model = SMOTE()
smote_x, smote_y = model.fit_resample(train_x, train_y)
smote_y = smote_y.astype('int')

ADASYN¶

In [16]:
adasyn = ADASYN(random_state=42)
adasyn_x, adasyn_y = adasyn.fit_resample(train_x, train_y)

Dummy regressor¶

In [17]:
dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, train_x, train_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')
Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9004
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     72230
           1       0.00      0.00      0.00      7994

    accuracy                           0.90     80224
   macro avg       0.45      0.50      0.47     80224
weighted avg       0.81      0.90      0.85     80224

[[72230     0]
 [ 7994     0]]


Execution time: 0 seconds
In [18]:
title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=train_y, predicted_y=predicted_y)
In [19]:
end_time = time.time()
execution_time = end_time - start_time
print(f'Preprocessing time: {round(execution_time)} seconds')
Preprocessing time: 10 seconds

Modeling¶

Models¶

In [20]:
train_start = time.time()
In [21]:
models = {}

Dummy classifier¶

In [22]:
dr = DummyClassifier(strategy='most_frequent')
models['Dummy classifier'] = dr

Logistic regression¶

In [23]:
lr = LogisticRegression(random_state=seed) #class_weight={0:1, 1:15}, 
models['Logistic regression'] = lr

Neural net¶

In [24]:
nn = MLPClassifier(max_iter=300)
models['Neural net'] = nn

Decision Tree¶

In [25]:
dt = DecisionTreeClassifier(random_state=seed) #max_depth=5, class_weight={0: 1, 1: 12}
models['Decision tree'] = dt

Random forest¶

In [26]:
rf = RandomForestClassifier(random_state=seed) #class_weight={0: 1, 1: 12},
                                               #criterion='gini',
                                               #max_depth=13,
                                               #max_features='log2',
                                               #min_samples_leaf=10,
                                               #n_estimators=26,
                                               #n_jobs=-1,
models['Random forest'] = rf

Naive Bayes¶

In [27]:
gnb = GaussianNB()
models['Naive Bayes'] = gnb

K-nearest neighbors¶

In [28]:
knn = KNeighborsClassifier(n_neighbors=1, weights='distance')
models['K-nearest neighbors'] = knn

Support vector machine¶

In [29]:
svm = SVC(C=0.1, kernel='poly', random_state=seed)
models['Support vector machine'] = svm

Quadratic discriminant analysis¶

In [30]:
qda = QuadraticDiscriminantAnalysis()
models['Quadratic Discriminant Analysis'] = qda

Adaboost¶

In [31]:
abc = AdaBoostClassifier(random_state=seed) #n_estimators=100, 
models['Adaboost'] = qda
In [32]:
print('Models originally considered:')
for key in models.keys():
    print('-', key)
Models originally considered:
- Dummy classifier
- Logistic regression
- Neural net
- Decision tree
- Random forest
- Naive Bayes
- K-nearest neighbors
- Support vector machine
- Quadratic Discriminant Analysis
- Adaboost

Pipeline¶

In [33]:
# Combine the classifiers in the ensemble model
ensemble_model = VotingClassifier(estimators=[('lr', lr),
                                              ('nn', nn),
                                              ('dt', dt),
                                              ('rf', rf),
                                              ('knn', knn),
                                              ('svm', svm),
                                              ('qda', qda),
                                              ('abc', abc)],
                                              voting='hard')

Evaluating model performance¶

In [34]:
modeling_start = time.time()
In [35]:
metric_columns = columns=['model', 'time_to_train', 'time_to_predict', 'total_time', 'accuracy', 'f1', 'recall']
metrics_train = metrics_valid = metrics_test = pd.DataFrame(columns=metric_columns)

Train¶

In [36]:
train_start_time = time.time()
In [37]:
for model_name, model in models.items():
    metrics_train = train_and_test_model(metrics_df=metrics_train,
                                         train_x=train_x,
                                         train_y=train_y,
                                         test_x=train_x,
                                         test_y=train_y,
                                         model=model,
                                         model_name=model_name)

DUMMY CLASSIFIER

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9004
F-score: 0.0
Recall: 0.0

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     72230
           1       0.00      0.00      0.00      7994

    accuracy                           0.90     80224
   macro avg       0.45      0.50      0.47     80224
weighted avg       0.81      0.90      0.85     80224

[[72230     0]
 [ 7994     0]]


LOGISTIC REGRESSION

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.9114
F-score: 0.3774
Recall: 0.2695

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     72230
           1       0.63      0.27      0.38      7994

    accuracy                           0.91     80224
   macro avg       0.78      0.63      0.66     80224
weighted avg       0.89      0.91      0.90     80224

[[70962  1268]
 [ 5840  2154]]


NEURAL NET

Time to train: 38 seconds
Time to predict: 0 seconds
Total time: 38 seconds
Accuracy:) 0.9213
F-score: 0.4633
Recall: 0.3409

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     72230
           1       0.72      0.34      0.46      7994

    accuracy                           0.92     80224
   macro avg       0.83      0.66      0.71     80224
weighted avg       0.91      0.92      0.91     80224

[[71186  1044]
 [ 5269  2725]]


DECISION TREE

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.9866
F-score: 0.9289
Recall: 0.88

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     72230
           1       0.98      0.88      0.93      7994

    accuracy                           0.99     80224
   macro avg       0.99      0.94      0.96     80224
weighted avg       0.99      0.99      0.99     80224

[[72112   118]
 [  959  7035]]


RANDOM FOREST

Time to train: 4 seconds
Time to predict: 1 seconds
Total time: 5 seconds
Accuracy:) 0.9865
F-score: 0.9303
Recall: 0.9049

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     72230
           1       0.96      0.90      0.93      7994

    accuracy                           0.99     80224
   macro avg       0.97      0.95      0.96     80224
weighted avg       0.99      0.99      0.99     80224

[[71906   324]
 [  760  7234]]


NAIVE BAYES

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.4603
F-score: 0.2554
Recall: 0.9288

              precision    recall  f1-score   support

           0       0.98      0.41      0.58     72230
           1       0.15      0.93      0.26      7994

    accuracy                           0.46     80224
   macro avg       0.56      0.67      0.42     80224
weighted avg       0.90      0.46      0.54     80224

[[29501 42729]
 [  569  7425]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 49 seconds
Total time: 49 seconds
Accuracy:) 0.9926
F-score: 0.9628
Recall: 0.9655

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     72230
           1       0.96      0.97      0.96      7994

    accuracy                           0.99     80224
   macro avg       0.98      0.98      0.98     80224
weighted avg       0.99      0.99      0.99     80224

[[71910   320]
 [  276  7718]]


SUPPORT VECTOR MACHINE

Time to train: 213 seconds
Time to predict: 29 seconds
Total time: 242 seconds
Accuracy:) 0.9092
F-score: 0.2378
Recall: 0.1422

              precision    recall  f1-score   support

           0       0.91      0.99      0.95     72230
           1       0.73      0.14      0.24      7994

    accuracy                           0.91     80224
   macro avg       0.82      0.57      0.59     80224
weighted avg       0.89      0.91      0.88     80224

[[71799   431]
 [ 6857  1137]]


QUADRATIC DISCRIMINANT ANALYSIS

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.3796
F-score: 0.2324
Recall: 0.9426

              precision    recall  f1-score   support

           0       0.98      0.32      0.48     72230
           1       0.13      0.94      0.23      7994

    accuracy                           0.38     80224
   macro avg       0.56      0.63      0.36     80224
weighted avg       0.90      0.38      0.45     80224

[[22922 49308]
 [  459  7535]]


ADABOOST

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.3796
F-score: 0.2324
Recall: 0.9426

              precision    recall  f1-score   support

           0       0.98      0.32      0.48     72230
           1       0.13      0.94      0.23      7994

    accuracy                           0.38     80224
   macro avg       0.56      0.63      0.36     80224
weighted avg       0.90      0.38      0.45     80224

[[22922 49308]
 [  459  7535]]

In [38]:
train_end_time = time.time()
train_execution_time = train_end_time - train_start_time
print(f'Train set execution time: {round(train_execution_time / 60)} minutes')
Train set execution time: 6 minutes
In [39]:
metrics_train.sort_values(by=['f1', 'time_to_predict'], ascending=False)
Out[39]:
model time_to_train time_to_predict total_time accuracy f1 recall
0 K-nearest neighbors 0 49 49 0.992571 0.962824 0.965474
0 Random forest 4 1 5 0.986488 0.930298 0.904929
0 Decision tree 0 0 0 0.986575 0.928897 0.880035
0 Neural net 38 0 38 0.921308 0.463317 0.340881
0 Logistic regression 1 0 1 0.911398 0.377365 0.269452
0 Naive Bayes 0 0 0 0.460286 0.255383 0.928822
0 Support vector machine 213 29 242 0.909154 0.237816 0.142232
0 Quadratic Discriminant Analysis 0 0 0 0.379649 0.232429 0.942582
0 Adaboost 0 0 0 0.379649 0.232429 0.942582
0 Dummy classifier 0 0 0 0.900354 0.0 0.0

We can already identify models that perform poorly on the very dataset they were trained upon. Let's get rid of those.

In [40]:
poor_models = metrics_train[metrics_train.f1 < 0.50].model

So our efforts will most probably focus on obtaining a good decision tree, and then building a random forest on top of it to mitigate overfitting risks.

In [41]:
for model in poor_models:
    if model in models.keys():
        del models[model]
        
print('Models finally considered for training:')
for key in models.keys():
    print('-', key)
Models finally considered for training:
- Decision tree
- Random forest
- K-nearest neighbors

We can now consider:

  • a K-Nearest Neighbors:
    • time to train: almost instantaneous
    • time to predict: about 2 minutes
    • F1 score: best
    • explainability: easy
    • risk: overfitting
  • a Random Forest:
    • time to train: about 10 seconds
    • time to predict: about 2 seconds
    • F1 score: very good
    • explainability: easy
    • risk: overfitting
  • a Decision Tree:
    • time to train: 1 second
    • time to predict: almost instantaneous
    • F1 score: very good
    • explainability: easy
    • risk: overfitting practically guaranteed
  • a Neural Net:
    • time to train: about 2 minutes
    • time to predict: almost instantaneous
    • F1 score: not great
    • explainability: black box
    • risk: not understanding the model

Based on this assessment above, let's give a chance to these algorithms on the validation set

Validate¶

In [42]:
valid_start_time = time.time()
In [43]:
for model_name, model in models.items():
    metrics_valid = train_and_test_model(metrics_df=metrics_valid,
                                         train_x=train_x,
                                         train_y=train_y,
                                         test_x=valid_x,
                                         test_y=valid_y,
                                         model=model,
                                         model_name=model_name)

DECISION TREE

Time to train: 0 seconds
Time to predict: 0 seconds
Total time: 0 seconds
Accuracy:) 0.8769
F-score: 0.3845
Recall: 0.3871

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     35590
           1       0.38      0.39      0.38      3924

    accuracy                           0.88     39514
   macro avg       0.66      0.66      0.66     39514
weighted avg       0.88      0.88      0.88     39514

[[33131  2459]
 [ 2405  1519]]


RANDOM FOREST

Time to train: 4 seconds
Time to predict: 1 seconds
Total time: 5 seconds
Accuracy:) 0.9058
F-score: 0.4261
Recall: 0.3522

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     35590
           1       0.54      0.35      0.43      3924

    accuracy                           0.91     39514
   macro avg       0.74      0.66      0.69     39514
weighted avg       0.89      0.91      0.90     39514

[[34409  1181]
 [ 2542  1382]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 24 seconds
Total time: 24 seconds
Accuracy:) 0.8737
F-score: 0.3794
Recall: 0.3886

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     35590
           1       0.37      0.39      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.65      0.66      0.65     39514
weighted avg       0.88      0.87      0.88     39514

[[32999  2591]
 [ 2399  1525]]

In [44]:
valid_end_time = time.time()
valid_execution_time = valid_end_time - valid_start_time
print(f'Validation set execution time: {round(valid_execution_time / 60)} minutes')
Validation set execution time: 1 minutes
In [45]:
metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
Out[45]:
model time_to_train time_to_predict total_time accuracy f1 recall
0 Random forest 4 1 5 0.90578 0.426083 0.352192
0 Decision tree 0 0 0 0.876904 0.38446 0.387105
0 K-nearest neighbors 0 24 24 0.873716 0.379353 0.388634

The models we considered all lost about 0.5 points on their F1 score. That's a severe hit. Before we try to improve them individually, let's see if an additional manipulation on the dataset, to deal with its imbalance, would improve our results.

Also, KNN is now our worst performing model, and neural net our best performing one. Random Forest is close on its tail though, so it definitely looks like a combination of work on a decision tree and a random forest would be the way to go.

Out of curiosity, what are the most important features recommended by a Random Forest model?

In [46]:
# fitting the model
model = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
model.fit(train_x, train_y)

# plotting feature importances
features = train_x_dummies.columns
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(10,15))
plt.title('Feature importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative importance')
plt.show()

Try to deal with imbalance: SMOTE¶

In [47]:
smote_metrics_valid = pd.DataFrame(columns=metric_columns)
In [48]:
for model_name, model in models.items():
    smote_metrics_valid = train_and_test_model(metrics_df=smote_metrics_valid,
                                               train_x=smote_x,
                                               train_y=smote_y,
                                               test_x=valid_x,
                                               test_y=valid_y,
                                               model=model,
                                               model_name=model_name)

DECISION TREE

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.8679
F-score: 0.3763
Recall: 0.4011

              precision    recall  f1-score   support

           0       0.93      0.92      0.93     35590
           1       0.35      0.40      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.64      0.66      0.65     39514
weighted avg       0.88      0.87      0.87     39514

[[32722  2868]
 [ 2350  1574]]


RANDOM FOREST

Time to train: 9 seconds
Time to predict: 1 seconds
Total time: 10 seconds
Accuracy:) 0.8837
F-score: 0.4455
Recall: 0.4704

              precision    recall  f1-score   support

           0       0.94      0.93      0.94     35590
           1       0.42      0.47      0.45      3924

    accuracy                           0.88     39514
   macro avg       0.68      0.70      0.69     39514
weighted avg       0.89      0.88      0.89     39514

[[33073  2517]
 [ 2078  1846]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 43 seconds
Total time: 43 seconds
Accuracy:) 0.8489
F-score: 0.3869
Recall: 0.4801

              precision    recall  f1-score   support

           0       0.94      0.89      0.91     35590
           1       0.32      0.48      0.39      3924

    accuracy                           0.85     39514
   macro avg       0.63      0.68      0.65     39514
weighted avg       0.88      0.85      0.86     39514

[[31658  3932]
 [ 2040  1884]]

In [49]:
smote_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
Out[49]:
model time_to_train time_to_predict total_time accuracy f1 recall
0 Random forest 9 1 10 0.883712 0.445517 0.470438
0 K-nearest neighbors 0 43 43 0.848864 0.386858 0.480122
0 Decision tree 1 0 1 0.867946 0.376285 0.401121

We don't see any improvement here. Let's try with ADASYN.

Try to deal with imbalance: ADASYN¶

In [50]:
adasyn_metrics_valid = pd.DataFrame(columns=metric_columns)
In [51]:
for model_name, model in models.items():
    adasyn_metrics_valid = train_and_test_model(metrics_df=adasyn_metrics_valid,
                                                 train_x=adasyn_x,
                                                 train_y=adasyn_y,
                                                 test_x=valid_x,
                                                 test_y=valid_y,
                                                 model=model,
                                                 model_name=model_name)

DECISION TREE

Time to train: 1 seconds
Time to predict: 0 seconds
Total time: 1 seconds
Accuracy:) 0.8698
F-score: 0.3778
Recall: 0.3981

              precision    recall  f1-score   support

           0       0.93      0.92      0.93     35590
           1       0.36      0.40      0.38      3924

    accuracy                           0.87     39514
   macro avg       0.65      0.66      0.65     39514
weighted avg       0.88      0.87      0.87     39514

[[32807  2783]
 [ 2362  1562]]


RANDOM FOREST

Time to train: 10 seconds
Time to predict: 1 seconds
Total time: 11 seconds
Accuracy:) 0.8828
F-score: 0.4471
Recall: 0.4771

              precision    recall  f1-score   support

           0       0.94      0.93      0.93     35590
           1       0.42      0.48      0.45      3924

    accuracy                           0.88     39514
   macro avg       0.68      0.70      0.69     39514
weighted avg       0.89      0.88      0.89     39514

[[33012  2578]
 [ 2052  1872]]


K-NEAREST NEIGHBORS

Time to train: 0 seconds
Time to predict: 44 seconds
Total time: 44 seconds
Accuracy:) 0.8455
F-score: 0.3852
Recall: 0.4875

              precision    recall  f1-score   support

           0       0.94      0.88      0.91     35590
           1       0.32      0.49      0.39      3924

    accuracy                           0.85     39514
   macro avg       0.63      0.69      0.65     39514
weighted avg       0.88      0.85      0.86     39514

[[31495  4095]
 [ 2011  1913]]

Let's also have a look at the decision tree we built earlier:

In [52]:
fig = plt.figure(figsize=(15, 10))
plot_tree(dt, 
          feature_names=train_x_dummies.columns,
          class_names=['0', '1'], 
          filled=True, impurity=True, 
          rounded=True,
          max_depth=5
         )
plt.show()

Now let's look at our models' performance:

In [53]:
adasyn_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
Out[53]:
model time_to_train time_to_predict total_time accuracy f1 recall
0 Random forest 10 1 11 0.882826 0.447098 0.477064
0 K-nearest neighbors 0 44 44 0.845472 0.385219 0.487513
0 Decision tree 1 0 1 0.869793 0.377797 0.398063

No improvement either. We'd like to avoid PCA, since we'd lose in explainability, so let's try some simple hyparameter tuning first.

Hyperparameter tuning¶

In [54]:
tuning_start = time.time()
In [55]:
rf = RandomForestClassifier()

param_grid = {'class_weight': ['balanced'],
              'n_estimators': [10,100,500,700],
              'max_features': ['auto','log2'],
              'max_depth' : [10, 100],
              'min_samples_leaf': [0.001, 0.01, 0.05],
              'criterion': ['gini', 'entropy']
             }

#scorer = {'F1': make_scorer(f1_score)}

grid_rf = GridSearchCV(rf, param_grid=param_grid, scoring='f1', cv=5, verbose=1)
grid_rf.fit(train_x, train_y)

print(grid_rf.best_params_)

prediction = grid_rf.predict(valid_x)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 0.001, 'n_estimators': 500}
In [56]:
grid_rf.best_score_
Out[56]:
0.44402874570737405
In [57]:
tuning_end = time.time()
tuning_execution_time = tuning_end - tuning_start
print()

In [58]:
ConfusionMatrixDisplay.from_predictions(valid_y, prediction, cmap='winter')
plt.title('Confusion Matrix - Random Forest with SMOTE (testing data)')
#plt.savefig(fname='011.png', format='png', dpi=400)
plt.show()

Let's keep our best performing model so far:

In [59]:
rf = RandomForestClassifier(class_weight='balanced',
                            criterion='gini',
                            max_depth=100, max_features='auto',
                            min_samples_leaf=0.001,
                            n_estimators=300)

Test¶

In [60]:
final_metrics = pd.DataFrame(columns=metric_columns)
In [61]:
final_metrics = train_and_test_model(metrics_df=final_metrics,
                                     train_x=train_x,
                                     train_y=train_y,
                                     test_x=test_x,
                                     test_y=test_y,
                                     model=rf,
                                     model_name='Random Forest')

RANDOM FOREST

Time to train: 6 seconds
Time to predict: 1 seconds
Total time: 7 seconds
Accuracy:) 0.7968
F-score: 0.4424
Recall: 0.8166

              precision    recall  f1-score   support

           0       0.98      0.79      0.88     54217
           1       0.30      0.82      0.44      5938

    accuracy                           0.80     60155
   macro avg       0.64      0.81      0.66     60155
weighted avg       0.91      0.80      0.83     60155

[[43083 11134]
 [ 1089  4849]]

Conclusions¶

The model we selected in the end provides a satisfying score, with a AUC of 0.81. However, the metric remains optimistic as the are still some important misclassification errors. The good news is that the performance of the model is on par with its performance on the test set, making it reliable for scalability purposes.

End¶

In [62]:
end_notebook = time.time()
print(f'Analysis took {round((end_notebook - start_notebook) / 60)} minutes to run.')
Analysis took 38 minutes to run.