Making Sense of Machine Learning with Explainable Artificial Intelligence

Python Supervised Machine Learning

I will be applying the methods of Explainable Artificial Intelligence (XAI) to extract interpretable insights from a classification model that predicts students’ grade repetition.

(14 min read)

Tarid Wongvorachan (University of Alberta) https://www.ualberta.ca
2022-05-05

Introduction

Show code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from imblearn.combine import SMOTEENN

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 123
Show code
df = pd.read_csv("PISA_TH.csv")

X = df.drop('REPEAT', axis=1)
y = df['REPEAT']

df.head()
   REPEAT    ESCS  DAYSKIP  ...  Invest_effort  WEALTH  Home_resource
0       0 -0.7914        1  ...              6  0.0721        -1.4469
1       0  0.8188        1  ...              8 -0.3429         1.1793
2       0  0.4509        1  ...             10  0.3031         1.1793
3       0  0.7086        1  ...             10 -0.5893        -0.1357
4       0  0.8361        1  ...             10  0.5406         1.1793

[5 rows x 25 columns]

Addressing Sample Imbalance

Show code
Counter(y)
Counter({0: 8044, 1: 589})
Show code
# Embed the original feature matrix in two t-SNE dimensions so the
# class overlap can be inspected visually before resampling.
tsne = TSNE(n_components=2, random_state=RANDOM_STATE)

TSNE_result = tsne.fit_transform(X)

plt.figure(figsize=(12,8))
# x/y must be passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and rejects further positional arguments.
sns.scatterplot(x=TSNE_result[:,0], y=TSNE_result[:,1], hue=y, legend='full', palette="hls")

ENN Editing with 1-NN Classifier. No copyright infringement is intended
Show code
smote_enn = SMOTEENN(random_state=RANDOM_STATE, sampling_strategy = 'minority', n_jobs=-1)

X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)
Counter({1: 8040, 0: 4794})
Show code
# Repeat the two-dimensional t-SNE embedding on the SMOTE-ENN resampled data
# to see how the class distribution changed.
tsne = TSNE(n_components=2, random_state=RANDOM_STATE)

TSNE_result = tsne.fit_transform(X_resampled)

plt.figure(figsize=(12,8))
# x/y must be passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and rejects further positional arguments.
sns.scatterplot(x=TSNE_result[:,0], y=TSNE_result[:,1], hue=y_resampled, legend='full', palette="hls")

Random Forest Ensemble

Show code
# Cross-validation scheme: stratified 10-fold, repeated twice (20 fits in total),
# seeded so the fold assignment is reproducible.
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=RANDOM_STATE)


# Hold out 30% of the rows for testing.
# NOTE(review): the split is taken from the already-resampled data
# (X_resampled / y_resampled), so the test set is drawn from resampled rows
# rather than from untouched original data; scores on it may be optimistic.
# Consider splitting first and resampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.30, 
                                                    random_state = RANDOM_STATE)
Show code
# random forest model creation
clf_rfc = RandomForestClassifier(random_state=RANDOM_STATE)
clf_rfc.fit(X_train, y_train)

# predictions
RandomForestClassifier(random_state=123)
Show code
rfc_predict = clf_rfc.predict(X_test)

rfc_cv_score = cross_val_score(clf_rfc, X_resampled, y_resampled, cv=CV, scoring='roc_auc')

print("=== All AUC Scores ===")
=== All AUC Scores ===
Show code
print(rfc_cv_score)
[0.98907675 0.99357898 0.99228597 0.99276793 0.99409399 0.99378369
 0.9931644  0.9901627  0.98967714 0.99287747 0.99384328 0.99252177
 0.99452348 0.99187137 0.99040419 0.99201929 0.9896265  0.99140778
 0.99115331 0.99457436]
Show code
print('\n')
Show code
print("=== Mean AUC Score ===")
=== Mean AUC Score ===
Show code
print("Mean AUC Score - RandForest: ", rfc_cv_score.mean())
Mean AUC Score - RandForest:  0.9921707177188173
Show code
#define metrics for normal RF
from sklearn import metrics

y_pred_proba_rf = clf_rfc.predict_proba(X_test)[::,1]
fpr_rf, tpr_rf, _ = metrics.roc_curve(y_test,  y_pred_proba_rf)

auc_rf = metrics.roc_auc_score(y_test, y_pred_proba_rf)
plt.plot(fpr_rf,tpr_rf, label="AUC for Random Forest Classifier = "+str(auc_rf.round(3)))
[<matplotlib.lines.Line2D object at 0x000001EA9D9FA340>]
Show code
plt.legend(loc="lower right")
<matplotlib.legend.Legend object at 0x000001EA9D9FA220>
Show code
plt.ylabel('True Positive Rate')
Text(0, 0.5, 'True Positive Rate')
Show code
plt.xlabel('False Positive Rate')
           
Text(0.5, 0, 'False Positive Rate')
Show code
plt.title("Receiver-Operator Curve (ROC)")
Text(0.5, 1.0, 'Receiver-Operator Curve (ROC)')
Show code
plt.show()

Explaining AI

Permutation Importance

Show code
# Build a pd.Series of the fitted forest's built-in feature importances,
# labelled with the feature (column) names.
# NOTE: `feature_importances_` is the importance stored on the fitted model
# (impurity-based per the scikit-learn docs); it is not the permutation
# importance, which is computed separately with eli5.
importances_rf = pd.Series(clf_rfc.feature_importances_, index = X_resampled.columns)

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_importance_rf = importances_rf.sort_values()

# Horizontal bar plot, one bar per feature
sorted_importance_rf.plot(kind='barh', color='lightgreen'); 
plt.xlabel('Feature Importance Score')
Text(0.5, 0, 'Feature Importance Score')
Show code
plt.ylabel('Features')
Text(0, 0.5, 'Features')
Show code
plt.title("Visualizing Important Features")
Text(0.5, 1.0, 'Visualizing Important Features')
Show code
plt.show()

Show code
import eli5
from eli5.sklearn import PermutationImportance

FEATURES = X_test.columns.tolist()

perm = PermutationImportance(clf_rfc, random_state=RANDOM_STATE).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = FEATURES, top = 10)
<IPython.core.display.HTML object>
Permutation Importance

Partial Dependence Plots

Show code
from pdpbox import pdp

pdp_bullied = pdp.pdp_isolate(model=clf_rfc, dataset=X_test, model_features=FEATURES, feature='BEINGBULLIED')

pdp.pdp_plot(pdp_bullied, 'BEINGBULLIED')
(<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='BEINGBULLIED'>})
Show code
plt.show()

Show code
features_to_plot = ['BEINGBULLIED', 'Parent_emosup']

inter1  =  pdp.pdp_interact(model=clf_rfc, dataset=X_test, model_features=FEATURES, features=features_to_plot)

pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')
(<Figure size 750x950 with 3 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_inter_ax': <AxesSubplot:xlabel='BEINGBULLIED', ylabel='Parent_emosup'>})
Show code
plt.show()

SHAP Values

SHAP Value for a prediction

SHAP Summary Plot

Show code
# Compute SHAP values for every row of the test set.
# NOTE(review): `explainer` is created in a cell that is not shown here.
shap_values_summary = explainer.shap_values(X_test)
# Plot the values just computed for the full test set (previously this line
# read `shap_values`, a different variable). Index 1 presumably selects the
# positive class (REPEAT = 1) — confirm against the installed SHAP version.
shap.summary_plot(shap_values_summary[1], X_test)
SHAP Summary Plot

SHAP Dependence Contribution Plot

Show code
shap.dependence_plot('BEINGBULLIED', shap_values_summary[1], X_test, interaction_index="Positive_feel")
SHAP Dependence Contribution Plot

Conclusion

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2022, May 5). Tarid Wongvorachan: Making Sense of Machine Learning with Explanable Artificial Intelligence. Retrieved from https://taridwong.github.io/posts/2022-04-28-xai/

BibTeX citation

@misc{wongvorachan2022making,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Making Sense of Machine Learning with Explanable Artificial Intelligence},
  url = {https://taridwong.github.io/posts/2022-04-28-xai/},
  year = {2022}
}