Supervised - Classification

This document contains end-to-end code for every step in building a supervised classification model using any of the following algorithms: - Logistic Regression - Linear Discriminant Analysis - K Nearest Neighbors - Decision Tree (CART) - Support Vector Machine - Ada Boost - Gradient Boosting Method - Random Forest - Extra Trees - Neural Network - Shallow - Deep Neural Network

2. Getting Started- Loading the data and python packages

## 2.1. Loading the python packages

   # Load libraries
   import numpy as np
   import pandas as pd
   from matplotlib import pyplot
   from pandas import read_csv, set_option
   from pandas.plotting import scatter_matrix
   import seaborn as sns
   from sklearn.preprocessing import StandardScaler
   from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
   from sklearn.linear_model import LogisticRegression
   from sklearn.tree import DecisionTreeClassifier
   from sklearn.neighbors import KNeighborsClassifier
   from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
   from sklearn.naive_bayes import GaussianNB
   from sklearn.svm import SVC
   from sklearn.neural_network import MLPClassifier
   from sklearn.pipeline import Pipeline
   from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
   from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

   #Libraries for Deep Learning Models
   from keras.models import Sequential
   from keras.layers import Dense
   from keras.wrappers.scikit_learn import KerasClassifier
   from keras.optimizers import SGD

   #Libraries for Saving the Model
   from pickle import dump
   from pickle import load

## 2.2. Loading the Data
# Load the German credit data set from the CSV file into a DataFrame
dataset = read_csv('german_credit_data.csv')
# Disable the warnings: every warning raised from this point on is suppressed
import warnings
warnings.filterwarnings('ignore')
# Notebook display: show the type of the loaded object
type(dataset)
pandas.core.frame.DataFrame

3. Exploratory Data Analysis

## 3.1. Descriptive Statistics

# shape: (number of rows, number of columns) of the loaded frame
dataset.shape
(1000, 10)
# peek at data: widen the console display to 100 characters, then show the first rows
set_option('display.width', 100)
dataset.head()
# types: allow up to 500 rows to be displayed, then list the dtype of every column
set_option('display.max_rows', 500)
dataset.dtypes
Age                 int64
Sex                object
Job                 int64
Housing            object
SavingAccounts     object
CheckingAccount    object
CreditAmount        int64
Duration            int64
Purpose            object
Risk               object
dtype: object
# describe data: summary statistics displayed with 3 decimal places.
# The fully qualified option name is used because the bare 'precision'
# pattern can match more than one option on recent pandas releases, which
# makes set_option raise an OptionError.
set_option('display.precision', 3)
dataset.describe()
# distribution of the categorical 'Housing' feature: number of rows per category
# (the class distribution of the target would be dataset.groupby('Risk').size())
dataset.groupby('Housing').size()
   Housing
   free    108
   own     713
   rent    179
   dtype: int64



## 3.2. Data Visualization
# histograms of the attributes, each panel with its own axes and tiny tick labels
hist_options = dict(
    sharex=False,
    sharey=False,
    xlabelsize=1,
    ylabelsize=1,
    figsize=(12,12),
)
dataset.hist(**hist_options)
pyplot.show()
AIType/output_20_0.png
# density: one kernel-density panel per attribute on a 3x3 grid
dataset.plot.density(
    subplots=True,
    layout=(3,3),
    sharex=False,
    legend=True,
    fontsize=1,
    figsize=(15,15),
)
pyplot.show()
AIType/output_21_0.png
# Box and whisker plots: one panel per attribute on a 3x3 grid, axes not shared
dataset.plot.box(
    subplots=True,
    layout=(3,3),
    sharex=False,
    sharey=False,
    figsize=(15,15),
)
pyplot.show()
AIType/output_22_0.png
# correlation matrix of the numeric attributes, drawn as an annotated heatmap.
# The numeric columns are selected explicitly: the frame also holds
# string-valued columns (Sex, Housing, ...), and recent pandas releases raise
# on them in corr() instead of silently skipping them.
correlation = dataset.select_dtypes(include='number').corr()
pyplot.figure(figsize=(15,15))
pyplot.title('Correlation Matrix')
sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='cubehelix')
<matplotlib.axes._subplots.AxesSubplot at 0x139ec1fa6a0>
AIType/output_23_1.png
# Scatterplot Matrix: pairwise scatter plots of the numeric attributes.
# scatter_matrix creates its own figure (sized through figsize), so no
# separate pyplot.figure() call is made; one would only leave an empty
# extra figure behind.
from pandas.plotting import scatter_matrix
scatter_matrix(dataset,figsize=(12,12))
pyplot.show()
<Figure size 1080x1080 with 0 Axes>
AIType/output_24_1.png

4. Data Preparation

## 4.1. Data Cleaning

Check for NAs in the rows, and either drop them or fill them with the mean of the column.

# Check whether the data set contains any null values (prints True or False)
print('Null Values =',dataset.isnull().values.any())
Null Values = True

Given that there are null values, drop the rows containing the null values.

   # Drop every row that contains at least one NA value
   dataset = dataset.dropna(axis=0)
   # Alternative (disabled): fill the NAs with the string '0' instead of dropping rows
   #dataset.fillna('0')

   # Alternative (disabled): fill the NAs of one column with that column's mean
   #dataset['col'] = dataset['col'].fillna(dataset['col'].mean())

## 4.2. Handling Categorical Data
from sklearn.preprocessing import LabelEncoder

# A single encoder is refitted for each column in turn, so after this cell it
# only remembers the mapping of the last column encoded ("Risk").
lb_make = LabelEncoder()

# (new integer-coded column, source column, replace missing values with '0' first?)
encoding_plan = [
    ("Sex_Code", "Sex", False),
    ("Housing_Code", "Housing", False),
    ("SavingAccount_Code", "SavingAccounts", True),
    ("CheckingAccount_Code", "CheckingAccount", True),
    ("Purpose_Code", "Purpose", False),
    ("Risk_Code", "Risk", False),
]
for code_column, source_column, fill_missing in encoding_plan:
    source_values = dataset[source_column]
    if fill_missing:
        source_values = source_values.fillna('0')
    dataset[code_column] = lb_make.fit_transform(source_values)

# Show a few original columns next to their integer codes
preview_columns = ["Sex", "Sex_Code","Housing","Housing_Code","Risk_Code","Risk"]
dataset[preview_columns].head(10)

## 4.3. Feature Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable. The scikit-learn library provides the SelectKBest class, which can be used with a suite of different statistical tests to select a specific number of features. The example below uses the chi-squared (chi²) statistical test for non-negative features to score every feature in the dataset; the selector is configured to keep the 5 best features, and the scores of up to 10 top-ranked features are printed.

from sklearn.feature_selection import SelectKBest, chi2

# Selector that scores features with the chi-squared test and keeps the 5 best
bestfeatures = SelectKBest(chi2, k=5)
# Notebook display of the configured selector
bestfeatures
SelectKBest(k=5, score_func=<function chi2 at 0x00000139EC248B70>)
# Target: the integer-coded risk label
Y = dataset["Risk_Code"]
# Predictors: every numeric column except the target. The frame still holds
# the original string columns (Sex, Housing, ..., Risk) next to their *_Code
# versions; chi2 cannot score strings, and the raw 'Risk' column would leak
# the label, so only numeric columns are kept.
X = dataset.loc[:, dataset.columns != 'Risk_Code'].select_dtypes(include='number')
fit = bestfeatures.fit(X, Y)
# One row per feature: its name ('Specs') and its chi-squared score ('Score')
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10,'Score'))  #print 10 best features
                  Specs      Score
2          CreditAmount  45853.601
3              Duration    327.508
6    SavingAccount_Code     14.395
7  CheckingAccount_Code      7.096
0                   Age      6.534
8          Purpose_Code      1.902
4              Sex_Code      0.671
1                   Job      0.318
5          Housing_Code      0.007

As can be seen from the numbers above, Credit Amount is the most important feature, followed by Duration.

## 4.4. Data Transformation

### 4.4.1. Rescale Data

When your data is comprised of attributes with varying scales, many machine learning algorithms can benefit from rescaling the attributes to all have the same scale. Often this is referred to as normalization, and attributes are often rescaled into the range between 0 and 1.

   from sklearn.preprocessing import MinMaxScaler
   # Predictors: numeric columns only, without the target. The frame still
   # holds the original string columns, which a scaler cannot process.
   X = dataset.loc[:, dataset.columns != 'Risk_Code'].select_dtypes(include='number')
   # Rescale every predictor into the range [0, 1]
   scaler = MinMaxScaler(feature_range=(0, 1))
   rescaledX = pd.DataFrame(scaler.fit_transform(X))
   # summarize transformed data
   rescaledX.head(5)


### 4.4.2. Standardize Data

Standardization is a useful technique to transform attributes with a Gaussian distribution and differing means and standard deviations to a standard Gaussian distribution with a mean of 0 and a standard deviation of 1.

   from sklearn.preprocessing import StandardScaler
   # Predictors: numeric columns only, without the target. The frame still
   # holds the original string columns, which a scaler cannot process.
   X = dataset.loc[:, dataset.columns != 'Risk_Code'].select_dtypes(include='number')
   # fit_transform fits the scaler and transforms in one pass, so no separate
   # fit() call is needed beforehand
   scaler = StandardScaler()
   StandardisedX = pd.DataFrame(scaler.fit_transform(X))
   # summarize transformed data
   StandardisedX.head(5)


### 4.4.3. Normalize Data

Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 (called a unit norm or a vector with the length of 1 in linear algebra).

from sklearn.preprocessing import Normalizer
# Predictors: numeric columns only, without the target. The frame still holds
# the original string columns, which cannot be normalised.
X = dataset.loc[:, dataset.columns != 'Risk_Code'].select_dtypes(include='number')
# fit_transform fits and transforms in one pass, so no separate fit() call is
# needed beforehand
scaler = Normalizer()
NormalizedX = pd.DataFrame(scaler.fit_transform(X))
# summarize transformed data
NormalizedX.head(5)

5. Evaluate Algorithms and Models

## 5.1. Train Test Split

   # split out validation dataset for the end
   # Target: the integer-coded risk label
   Y = dataset["Risk_Code"]
   # Predictors: numeric columns only, without the target. The frame still
   # holds the original string columns (including the raw 'Risk' label),
   # which the estimators cannot consume.
   X = dataset.loc[:, dataset.columns != 'Risk_Code'].select_dtypes(include='number')
   validation_size = 0.2
   seed = 7
   # No scaler is fitted here: one fitted on all rows before the split would
   # have seen the validation data, and the split operates on the unscaled
   # predictors anyway.
   X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

## 5.2. Test Options and Evaluation Metrics
   # Evaluation settings for the classification model comparison
   # Metric passed to cross_val_score; alternatives kept for reference:
   #scoring ='neg_log_loss'
   #scoring = 'roc_auc'
   scoring = 'accuracy'
   # Random seed and number of cross-validation folds
   seed = 7
   num_folds = 10

## 5.3. Compare Models and Algorithms

### 5.3.1. Common Models
   # Candidate models to spot check, as (short name, unfitted estimator) pairs,
   # all with default hyper-parameters
   models = [
       ('LR', LogisticRegression()),
       ('LDA', LinearDiscriminantAnalysis()),
       ('KNN', KNeighborsClassifier()),
       ('CART', DecisionTreeClassifier()),
       ('NB', GaussianNB()),
       ('SVM', SVC()),
       # Neural network
       ('NN', MLPClassifier()),
   ]

### 5.3.2. Ensemble Models
   # Ensemble models, appended to the candidate list in this order
   models.extend([
       # Boosting methods
       ('AB', AdaBoostClassifier()),
       ('GBM', GradientBoostingClassifier()),
       # Bagging methods
       ('RF', RandomForestClassifier()),
       ('ET', ExtraTreesClassifier()),
   ])

### 5.3.3. Deep Learning Model
# Deep learning classifier, added to the candidate list only when the flag below is set
# Set the following flag to 1 to enable the deep learning model, or to 0 to skip it
EnableDLModelsFlag = 1
if EnableDLModelsFlag == 1 :
    # Function to create model, required for KerasClassifier
    def create_model(neurons=12, activation='relu', learn_rate = 0.01, momentum=0):
        """Build and compile a small feed-forward binary classifier.

        Args:
            neurons: number of units in the first hidden layer.
            activation: activation used by the two hidden layers.
            learn_rate: learning rate handed to the SGD optimizer.
            momentum: momentum handed to the SGD optimizer.

        Returns:
            The compiled Sequential model; its input width is the number of
            columns of X_train.
        """
        # create model: two hidden layers followed by a single sigmoid output unit
        model = Sequential()
        model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
        model.add(Dense(2, activation=activation))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model with the SGD optimizer configured from the arguments, so
        # that learn_rate and momentum actually take effect
        optimizer = SGD(lr=learn_rate, momentum=momentum)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        return model
    models.append(('DNN', KerasClassifier(build_fn=create_model, epochs=10, batch_size=10, verbose=1)))

K-folds cross validation

# Cross-validate every candidate model on the training split and collect the
# per-fold scores (results) together with the model names (names).
results = []
names = []
# The fold layout does not depend on the model, so the splitter is built once.
# shuffle=True is what makes random_state meaningful; current scikit-learn
# raises a ValueError when random_state is set while shuffle is left off.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # name: mean score (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
LR: 0.626074 (0.064426)
LDA: 0.611614 (0.055923)
KNN: 0.529791 (0.063048)
CART: 0.563763 (0.097660)
NB: 0.611324 (0.061465)
SVM: 0.592102 (0.077275)
NN: 0.503775 (0.059635)
AB: 0.621138 (0.045846)
GBM: 0.633159 (0.076016)
RF: 0.618815 (0.077372)
ET: 0.582753 (0.074896)

Algorithm comparison

# compare algorithms: one box per model, built from its cross-validation scores
fig, ax = pyplot.subplots()
fig.set_size_inches(15,8)
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()
AIType/output_60_0.png

7. Finalise the Model

Looking at the cross-validation results above, GBM achieves the highest mean accuracy (about 0.63), with logistic regression close behind, so GBM is selected as the final model.

Finalize Model with best parameters found during tuning step.

## 7.1. Results on the Test Dataset

# prepare model
# Fit a scaler on the training split only and keep the standardised training matrix
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# Gradient boosting with 20 boosting stages and trees limited to depth 5
model = GradientBoostingClassifier(n_estimators=20, max_depth=5)
# NOTE: the model is fitted on the unscaled X_train, not on rescaledX
model.fit(X_train, Y_train)
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=20,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
# estimate accuracy on validation set
# The model was fitted on the unscaled X_train, so it is scored on the unscaled
# X_validation as well; no scaled copy of the validation data is needed here.
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
0.6666666666666666
[[30 22]
 [13 40]]
              precision    recall  f1-score   support

           0       0.70      0.58      0.63        52
           1       0.65      0.75      0.70        53

    accuracy                           0.67       105
   macro avg       0.67      0.67      0.66       105
weighted avg       0.67      0.67      0.66       105
# Notebook display: predicted class labels for the validation rows
predictions
array([0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0])
# Notebook display: true class labels of the validation rows, for comparison
Y_validation
   998    0
   989    1
   664    1
   474    0
   601    0
   918    0
   114    1
   7      1
   593    0
   201    1
   946    0
   156    1
   375    0
   513    1
   177    1
   89     0
   466    0
   537    1
   634    0
   927    0
   454    0
   648    0
   938    0
   530    1
   818    1
   498    1
   197    0
   961    1
   405    0
   432    1
   806    1
   35     0
   531    0
   334    0
   652    0
   22     1
   677    0
   605    1
   515    1
   51     1
   145    1
   729    1
   475    0
   313    0
   252    0
   97     1
   969    1
   88     1
   501    1
   38     1
   273    0
   793    1
   576    1
   479    1
   442    1
   320    0
   212    0
   172    0
   917    0
   812    0
   207    1
   72     1
   727    0
   491    0
   849    0
   919    0
   328    1
   834    0
   835    0
   721    0
   711    0
   347    1
   896    1
   831    0
   521    0
   930    1
   832    0
   623    1
   684    1
   666    1
   458    1
   157    1
   602    0
   284    1
   714    0
   107    1
   422    1
   653    0
   730    1
   416    0
   293    1
   923    1
   876    1
   191    0
   892    1
   709    1
   814    0
   471    0
   398    0
   506    1
   597    0
   44     0
   34     1
   840    0
   47     1
   Name: Risk_Code, dtype: int32



## 7.2. Variable Intuition/Feature Importance

GBM is the model finalised above, so let us look into the feature importance of a GBM model.

import pandas as pd
import numpy as np
# Refit a gradient boosting model with default settings on the standardised
# training data; this replaces the model object fitted earlier
model = GradientBoostingClassifier()
model.fit(rescaledX,Y_train)
# Built-in importance scores of tree-based classifiers, one value per input column
importances = model.feature_importances_
print(importances)
# Horizontal bar chart of the (up to) 10 most important features, labelled by column name
feat_importances = pd.Series(importances, index=X.columns)
top_importances = feat_importances.nlargest(10)
top_importances.plot.barh()
pyplot.show()
[0.14559042 0.02828504 0.45990366 0.23325303 0.00326138 0.02257884
 0.03420548 0.02710298 0.04581917]
# Save Model Using Pickle
from pickle import dump
from pickle import load

# save the model to disk; the context manager closes the file handle
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    dump(model, model_file)
# some time later...
# load the model from disk
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself
with open(filename, 'rb') as model_file:
    loaded_model = load(model_file)
# estimate accuracy on validation set using the reloaded model, so that the
# save/load round trip is what actually gets checked
# NOTE: the scaler is not saved with the model and is taken from the session here
rescaledValidationX = scaler.transform(X_validation)
predictions = loaded_model.predict(rescaledValidationX)
result = accuracy_score(Y_validation, predictions)
print(result)
0.7047619047619048