Making Sense of Machine Learning with Explainable Artificial Intelligence

Python Supervised Machine Learning

I will be applying the methods of Explainable Artificial Intelligence (XAI) to extract interpretable insights from a classification model that predicts students’ grade repetition.

Tarid Wongvorachan (University of Alberta)


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from imblearn.combine import SMOTEENN

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import warnings

# Load the student-level data from PISA_TH.csv.
df = pd.read_csv("PISA_TH.csv")

# Target: the 'REPEAT' column; predictors: every remaining column.
y = df['REPEAT']
X = df.drop(columns='REPEAT')

   REPEAT    ESCS  DAYSKIP  ...  Invest_effort  WEALTH  Home_resource
0       0 -0.7914        1  ...              6  0.0721        -1.4469
1       0  0.8188        1  ...              8 -0.3429         1.1793
2       0  0.4509        1  ...             10  0.3031         1.1793
3       0  0.7086        1  ...             10 -0.5893        -0.1357
4       0  0.8361        1  ...             10  0.5406         1.1793

[5 rows x 25 columns]

Addressing Sample Imbalance

Counter({0: 8044, 1: 589})
# Project the original predictors onto two t-SNE dimensions for visualization.
tsne = TSNE(n_components=2, random_state=RANDOM_STATE)

TSNE_result = tsne.fit_transform(X)

# Pass the coordinates by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so positional x/y vectors raise a TypeError there.
sns.scatterplot(x=TSNE_result[:, 0], y=TSNE_result[:, 1], hue=y, legend='full', palette="hls")

ENN Editing with 1-NN Classifier. No copyright infringement is intended
# Rebalance the classes with SMOTE over-sampling followed by ENN cleaning.
smote_enn = SMOTEENN(
    sampling_strategy='minority',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter({1: 8040, 0: 4794})
# Project the resampled predictors onto two t-SNE dimensions for visualization.
tsne = TSNE(n_components=2, random_state=RANDOM_STATE)

TSNE_result = tsne.fit_transform(X_resampled)

# Pass the coordinates by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so positional x/y vectors raise a TypeError there.
sns.scatterplot(x=TSNE_result[:, 0], y=TSNE_result[:, 1], hue=y_resampled, legend='full', palette="hls")

Random Forest Ensemble

# 10-fold stratified cross-validation, repeated twice (20 scores in total).
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=RANDOM_STATE)

# Hold out 30% of the resampled data for testing.
# NOTE(review): both this split and the cross-validation below run on the
# SMOTEENN-resampled data, so the held-out rows are themselves resampled and
# the resulting scores may be optimistic. Consider resampling the training
# portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.30, random_state=RANDOM_STATE
)

# random forest model creation
# Fit on the training split so that predict, predict_proba and
# feature_importances_ are available on clf_rfc afterwards.
clf_rfc = RandomForestClassifier(random_state=RANDOM_STATE)
clf_rfc.fit(X_train, y_train)

# predictions
rfc_predict = clf_rfc.predict(X_test)

# ROC-AUC of the classifier for every fold/repeat defined by CV.
rfc_cv_score = cross_val_score(clf_rfc, X_resampled, y_resampled, cv=CV, scoring='roc_auc')

print("=== All AUC Scores ===")
=== All AUC Scores ===
[0.98907675 0.99357898 0.99228597 0.99276793 0.99409399 0.99378369
 0.9931644  0.9901627  0.98967714 0.99287747 0.99384328 0.99252177
 0.99452348 0.99187137 0.99040419 0.99201929 0.9896265  0.99140778
 0.99115331 0.99457436]
print("=== Mean AUC Score ===")
=== Mean AUC Score ===
Show code
print("Mean AUC Score - RandForest: ", rfc_cv_score.mean())
Mean AUC Score - RandForest:  0.9921707177188173
# ROC curve and AUC of the fitted random forest on the held-out test set
from sklearn import metrics

# Column 1 of predict_proba holds the probability of the second class in clf_rfc.classes_
y_pred_proba_rf = clf_rfc.predict_proba(X_test)[:, 1]
auc_rf = metrics.roc_auc_score(y_test, y_pred_proba_rf)
fpr_rf, tpr_rf, _ = metrics.roc_curve(y_test, y_pred_proba_rf)

plt.plot(
    fpr_rf,
    tpr_rf,
    label="AUC for Random Forest Classifier = " + str(auc_rf.round(3)),
)
[<matplotlib.lines.Line2D object at 0x000001EA9D9FA340>]
plt.legend(loc="lower right")
<matplotlib.legend.Legend object at 0x000001EA9D9FA220>
plt.ylabel('True Positive Rate')
Text(0, 0.5, 'True Positive Rate')
plt.xlabel('False Positive Rate')
Text(0.5, 0, 'False Positive Rate')
plt.title("Receiver-Operator Curve (ROC)")
Text(0.5, 1.0, 'Receiver-Operator Curve (ROC)')
Explaining AI

Permutation Importance

Show code
# Importances reported by the fitted forest, labelled by feature name
importances_rf = pd.Series(clf_rfc.feature_importances_, index=X_resampled.columns)

# Ascending order, so the most important feature ends up at the top of the chart
sorted_importance_rf = importances_rf.sort_values()

# Horizontal bar plot
sorted_importance_rf.plot.barh(color='lightgreen')
plt.xlabel('Feature Importance Score')
Text(0.5, 0, 'Feature Importance Score')
Text(0, 0.5, 'Features')
plt.title("Visualizing Important Features")
Text(0.5, 1.0, 'Visualizing Important Features')
import eli5
from eli5.sklearn import PermutationImportance

# Feature names, in the column order of the test set
FEATURES = X_test.columns.tolist()

# Permutation importance of the fitted forest, measured on the held-out test data
perm = PermutationImportance(clf_rfc, random_state=RANDOM_STATE)
perm.fit(X_test, y_test)

# Show the ten highest-weighted features
eli5.show_weights(perm, feature_names=FEATURES, top=10)
<IPython.core.display.HTML object>
Permutation Importance

Partial Dependence Plots

from pdpbox import pdp

# Partial dependence of the model's prediction on 'BEINGBULLIED' alone
pdp_bullied = pdp.pdp_isolate(
    model=clf_rfc,
    dataset=X_test,
    model_features=FEATURES,
    feature='BEINGBULLIED',
)

pdp.pdp_plot(pdp_bullied, 'BEINGBULLIED')
(<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='BEINGBULLIED'>})
# Two-way partial dependence: joint effect of the two features on the prediction
features_to_plot = ['BEINGBULLIED', 'Parent_emosup']

inter1 = pdp.pdp_interact(
    model=clf_rfc,
    dataset=X_test,
    model_features=FEATURES,
    features=features_to_plot,
)

pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=features_to_plot,
    plot_type='contour',
)
(<Figure size 750x950 with 3 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_inter_ax': <AxesSubplot:xlabel='BEINGBULLIED', ylabel='Parent_emosup'>})
Show code

SHAP Values

SHAP Value for a prediction

SHAP Summary Plot

# SHAP values for every row of the test set
shap_values_summary = explainer.shap_values(X_test)
# Plot the values computed above (not a differently named variable), matching
# the dependence plot further down. Index 1 selects the second class's values
# (assumes shap_values returns one array per class — TODO confirm for the
# installed shap version).
shap.summary_plot(shap_values_summary[1], X_test)
SHAP Summary Plot

SHAP Dependence Contribution Plot

# SHAP values of 'BEINGBULLIED' against its feature values, coloured by 'Positive_feel'
shap.dependence_plot(
    'BEINGBULLIED',
    shap_values_summary[1],
    X_test,
    interaction_index="Positive_feel",
)
SHAP Dependence Contribution Plot



