Combining Multiple Machine Learning Models with Ensemble Methods

Python Supervised Machine Learning

This entry explores different ways to combine supervised machine learning models to maximize their predictive capability.

(13 min read)

Tarid Wongvorachan (University of Alberta), https://www.ualberta.ca
2022-04-09

What are ensemble methods?

Ensemble methods combine the predictions of several base models, for example by voting, bagging, boosting, or stacking, so that the combined model predicts more accurately and more robustly than any single model on its own. The sections below walk through each of these strategies on a mushroom classification dataset and compare them against individual classifiers.

Image from https://www.irasutoya.com. No copyright infringement is intended
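
As a toy illustration (not from the original analysis), hard majority voting reduces to a few lines of NumPy: each observation gets whichever label most of the base models predict for it. The three prediction vectors below are made up for the example.

import numpy as np

# Hypothetical binary predictions from three base models for five observations
preds = np.array([[0, 1, 1, 0, 1],   # model A
                  [0, 1, 0, 0, 1],   # model B
                  [1, 1, 1, 0, 0]])  # model C

# An observation is labelled 1 when at least two of the three models predict 1
majority = (preds.sum(axis=0) >= 2).astype(int)
print(majority)  # [0 1 1 0 1]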

Setting up the Environment and Data set

Show code
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
Show code
df = pd.read_csv("mushrooms.csv")

# data set shape
print("There are {} rows and {} columns in this dataset".format(df.shape[0], df.shape[1]))
There are 8124 rows and 23 columns in this dataset
Show code
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring    8124 non-null   object
 15  stalk-color-below-ring    8124 non-null   object
 16  veil-type                 8124 non-null   object
 17  veil-color                8124 non-null   object
 18  ring-number               8124 non-null   object
 19  ring-type                 8124 non-null   object
 20  spore-print-color         8124 non-null   object
 21  population                8124 non-null   object
 22  habitat                   8124 non-null   object
dtypes: object(23)
memory usage: 1.4+ MB
Show code
class_dict = {'e': 'edible', 'p': 'poisonous'}
df['class'] = df['class'].map(class_dict)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="class", data = df)
plt.show()

Data Preprocessing

Show code

from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()

for col in df.columns:
    df[col] = labelencoder.fit_transform(df[col])

#The model knows nothing about mushrooms; it only learns patterns from the encoded values of each feature.

#Checking the encoded values
df['stalk-color-above-ring'].unique()
array([7, 3, 6, 4, 0, 2, 5, 1, 8])
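
A side note on the encoding: LabelEncoder assigns an arbitrary integer order to categories that are purely nominal. Tree-based models are largely indifferent to that, but distance- and coefficient-based learners such as KNN and logistic regression can misread those integers as magnitudes. One-hot encoding avoids this; a minimal sketch, assuming the raw string-valued dataframe were kept in a copy called df_raw (a hypothetical name, not used elsewhere in this post):

# Hypothetical alternative: one-hot encode the nominal features instead of label encoding
X_onehot = pd.get_dummies(df_raw.drop('class', axis=1))   # one indicator column per category level
y_onehot = (df_raw['class'] == 'poisonous').astype(int)   # poisonous = 1, edible = 0
print(X_onehot.shape)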
Show code
print(df.groupby('class').size())
class
0    4208
1    3916
dtype: int64
Show code
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="class", data = df)
plt.show()

Show code

RANDOM_STATE = 123

X = df.drop('class', axis=1) #features
y = df['class'] #label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, 
                                                    random_state = RANDOM_STATE)
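
Because the classes are slightly imbalanced (4,208 edible vs. 3,916 poisonous), stratify=y can be passed to keep the class proportions identical in both splits. The results below use the unstratified split above; the stratified variant is only a sketch:

# Optional variant: stratified split that preserves the edible/poisonous ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.30, random_state = RANDOM_STATE, stratify = y)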

Performance of a Single Model

Naive Bayes

Show code
#Instantiate a Naive Bayes classifier
clf_nb = GaussianNB()

# Fit the model to the training set
clf_nb.fit(X_train,y_train)

GaussianNB()
Show code
pred_nb = clf_nb.predict(X_test)

# Evaluate the performance using the accuracy score
print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_nb)))
Accuracy: 0.9130
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_nb)))
F1: 0.9091

Logistic Regression

Show code
clf_lr = LogisticRegression(max_iter = 450, random_state = RANDOM_STATE)

clf_lr.fit(X_train,y_train)
LogisticRegression(max_iter=450, random_state=123)
Show code
pred_lr = clf_lr.predict(X_test)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_lr)))
Accuracy: 0.9471
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_lr)))
F1: 0.9444

Voting and Averaging

Show code
clf_knn = KNeighborsClassifier(n_neighbors = 5)

clf_dt = DecisionTreeClassifier(min_samples_leaf = 3, min_samples_split = 9, random_state = RANDOM_STATE)

from mlxtend.classifier import EnsembleVoteClassifier

clf_vote = EnsembleVoteClassifier(clfs=[clf_nb, clf_knn, clf_dt], voting = "hard")

clf_vote.fit(X_train, y_train)
EnsembleVoteClassifier(clfs=[GaussianNB(), KNeighborsClassifier(),
                             DecisionTreeClassifier(min_samples_leaf=3,
                                                    min_samples_split=9,
                                                    random_state=123)])
Show code
pred_vote = clf_vote.predict(X_test)

score_vote = f1_score(y_test, pred_vote)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_vote)))
Accuracy: 0.9996
Show code
print('F1-Score: {:.3f}'.format(score_vote))
F1-Score: 1.000
Show code
clf_ave = EnsembleVoteClassifier(clfs=[clf_nb, clf_lr, clf_knn, clf_dt], voting = "soft")

clf_ave.fit(X_train, y_train)
EnsembleVoteClassifier(clfs=[GaussianNB(),
                             LogisticRegression(max_iter=450, random_state=123),
                             KNeighborsClassifier(),
                             DecisionTreeClassifier(min_samples_leaf=3,
                                                    min_samples_split=9,
                                                    random_state=123)],
                       voting='soft')
Show code
pred_ave = clf_ave.predict(X_test)

score_ave = f1_score(y_test, pred_ave)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_ave)))
Accuracy: 0.9979
Show code
print('F1-Score: {:.3f}'.format(score_ave))
F1-Score: 0.998
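
scikit-learn ships its own voting ensemble, VotingClassifier, which behaves much like mlxtend's EnsembleVoteClassifier. A minimal sketch with the same base learners (soft voting requires every base estimator to implement predict_proba, which all four do):

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble built with scikit-learn's own API
clf_vote_sk = VotingClassifier(
    estimators=[('nb', clf_nb), ('lr', clf_lr), ('knn', clf_knn), ('dt', clf_dt)],
    voting='soft')
clf_vote_sk.fit(X_train, y_train)
print("F1: {:0.4f}".format(f1_score(y_test, clf_vote_sk.predict(X_test))))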

Bagging

Show code
# random forest model creation
clf_rf = RandomForestClassifier(random_state = RANDOM_STATE)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(random_state=123)
Show code
pred_rfc = clf_rf.predict(X_test)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_rfc)))
Accuracy: 1.0000
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_rfc)))
F1: 1.0000
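
A random forest is a specific bagged ensemble of decision trees with additional feature subsampling at each split. The generic form, bagging an arbitrary base estimator over bootstrap samples, is available as BaggingClassifier. A minimal sketch (the base estimator is passed positionally because its keyword changed from base_estimator to estimator in scikit-learn 1.2):

from sklearn.ensemble import BaggingClassifier

# 100 decision trees, each fit on a bootstrap sample of the training set
clf_bag = BaggingClassifier(
    DecisionTreeClassifier(random_state = RANDOM_STATE),
    n_estimators = 100,
    random_state = RANDOM_STATE)
clf_bag.fit(X_train, y_train)
print("F1: {:0.4f}".format(f1_score(y_test, clf_bag.predict(X_test))))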
Show code
from sklearn import tree

fn = X.columns  # feature names
cn = ['edible', 'poisonous']  # class labels (after encoding, 0 = edible and 1 = poisonous)

fig, axes = plt.subplots(nrows = 1,ncols = 5,figsize = (10,2), dpi=900)
for index in range(0, 5):
    tree.plot_tree(clf_rf.estimators_[index],
                   feature_names = fn, 
                   class_names = cn,
                   filled = True,
                   ax = axes[index]);

    axes[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig.savefig('rf_5trees.png')
Five Samples of Trees from the Random Forest Model

Boosting

Show code
#Call the model
clf_gbm = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, random_state=RANDOM_STATE)

#Fit the model
clf_gbm.fit(X_train, y_train)
GradientBoostingClassifier(random_state=123)
Show code
pred_gbm = clf_gbm.predict(X_test)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_gbm)))
Accuracy: 1.0000
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_gbm)))
F1: 1.0000
Show code
#n_estimators sets the number of boosting rounds
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', n_estimators = 10, seed = RANDOM_STATE, use_label_encoder = False)
clf_xgb.fit(X_train, y_train)
[23:57:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
Show code
pred_xgb = clf_xgb.predict(X_test)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_xgb)))
Accuracy: 1.0000
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_xgb)))
F1: 1.0000
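
The warning printed above is only about the default evaluation metric changing in XGBoost 1.3; passing eval_metric explicitly silences it. A minimal sketch (in XGBoost 1.6+ the metric can go in the constructor as shown; older releases accept it as an argument to fit() instead):

# Explicit eval_metric avoids the default-metric warning shown above
clf_xgb2 = xgb.XGBClassifier(objective='binary:logistic', n_estimators = 10,
                             eval_metric = 'logloss', use_label_encoder = False,
                             seed = RANDOM_STATE)
clf_xgb2.fit(X_train, y_train)
print("F1: {:0.4f}".format(f1_score(y_test, clf_xgb2.predict(X_test))))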

Stacking

Model stacking flow chart. Image from https://towardsai.net/p/l/machine-learning-model-stacking-in-python. No copyright infringement is intended
Show code
clf_stack = StackingClassifier(classifiers=[clf_knn, clf_dt, clf_nb, clf_rf], meta_classifier=clf_lr)
clf_stack.fit(X_train, y_train)
StackingClassifier(classifiers=[KNeighborsClassifier(),
                                DecisionTreeClassifier(min_samples_leaf=3,
                                                       min_samples_split=9,
                                                       random_state=123),
                                GaussianNB(),
                                RandomForestClassifier(random_state=123)],
                   meta_classifier=LogisticRegression(max_iter=450,
                                                      random_state=123))
Show code
pred_stack = clf_stack.predict(X_test)

print("Accuracy: {:0.4f}".format(accuracy_score(y_test, pred_stack)))
Accuracy: 1.0000
Show code
print("F1: {:0.4f}".format(f1_score(y_test, pred_stack)))
F1: 1.0000
Show code
report_stack = classification_report(y_test, pred_stack)
print(report_stack)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1271
           1       1.00      1.00      1.00      1167

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438
Show code
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_stack)
array([[1271,    0],
       [   0, 1167]], dtype=int64)
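
scikit-learn also offers a StackingClassifier. Unlike the mlxtend version used above, it builds the meta-features from cross-validated predictions of the base models, which reduces the chance of the meta-learner overfitting to base-model predictions made on their own training data. A minimal sketch with the same learners (imported under an alias to avoid clashing with the mlxtend class):

from sklearn.ensemble import StackingClassifier as SkStackingClassifier

# Meta-features come from 5-fold cross-validated predictions of the base models
clf_stack_sk = SkStackingClassifier(
    estimators=[('knn', clf_knn), ('dt', clf_dt), ('nb', clf_nb), ('rf', clf_rf)],
    final_estimator=LogisticRegression(max_iter = 450, random_state = RANDOM_STATE),
    cv=5)
clf_stack_sk.fit(X_train, y_train)
print("F1: {:0.4f}".format(f1_score(y_test, clf_stack_sk.predict(X_test))))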
Show code
params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta_classifier__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=clf_stack, 
                    param_grid=params, 
                    cv=5,
                    refit=True)

grid.fit(X_train, y_train)
GridSearchCV(cv=5,
             estimator=StackingClassifier(classifiers=[KNeighborsClassifier(),
                                                       DecisionTreeClassifier(min_samples_leaf=3,
                                                                              min_samples_split=9,
                                                                              random_state=123),
                                                       GaussianNB(),
                                                       RandomForestClassifier(random_state=123)],
                                          meta_classifier=LogisticRegression(max_iter=450,
                                                                             random_state=123)),
             param_grid={'kneighborsclassifier__n_neighbors': [1, 5],
                         'meta_classifier__C': [0.1, 10.0],
                         'randomforestclassifier__n_estimators': [10, 50]})
Show code
cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}
Show code
print('Best parameters: %s' % grid.best_params_)
Best parameters: {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
Show code
print('Accuracy: %.2f' % grid.best_score_)
Accuracy: 1.00
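
Because refit=True, the grid object keeps the best stacked model refit on the full training set, so it can be scored on the held-out test set just like the individual models above:

# Evaluate the refit best estimator on the untouched test set
pred_grid = grid.best_estimator_.predict(X_test)
print("Test accuracy: {:0.4f}".format(accuracy_score(y_test, pred_grid)))
print("Test F1: {:0.4f}".format(f1_score(y_test, pred_grid)))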
Show code
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)

X_np = X_train.to_numpy()
y_np = y_train.to_numpy()

X_np_reduced = pca.fit_transform(X_np)

gs = gridspec.GridSpec(2, 2)

fig = plt.figure(figsize=(10,8))

for clf, lab, grd in zip([clf_knn, clf_dt, clf_nb, clf_stack], 
                         ['KNN', 
                          'Decision Tree', 
                          'Naive Bayes',
                          'StackingClassifier'],
                          itertools.product([0, 1], repeat=2)):

    clf.fit(X_np_reduced, y_np)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X_np_reduced, y=y_np, clf=clf)
    plt.title(lab)
    
KNeighborsClassifier()
Text(0.5, 1.0, 'KNN')
DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9,
                       random_state=123)
Text(0.5, 1.0, 'Decision Tree')
GaussianNB()
Text(0.5, 1.0, 'Naive Bayes')
StackingClassifier(classifiers=[KNeighborsClassifier(),
                                DecisionTreeClassifier(min_samples_leaf=3,
                                                       min_samples_split=9,
                                                       random_state=123),
                                GaussianNB(),
                                RandomForestClassifier(random_state=123)],
                   meta_classifier=LogisticRegression(max_iter=450,
                                                      random_state=123))
Text(0.5, 1.0, 'StackingClassifier')
Show code
plt.show()

Conclusion

On this dataset, every ensemble strategy tried here (voting, bagging, boosting, and stacking) reached near-perfect accuracy and F1 scores, clearly ahead of the single Naive Bayes (accuracy 0.913) and logistic regression (accuracy 0.947) baselines. The mushroom data are easy to separate, so the perfect scores say more about the dataset than about any one technique, but the broader point stands: combining complementary models is a simple and reliable way to gain additional predictive performance over any individual learner.

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2022, April 9). Tarid Wongvorachan: Combining Multiple Machine Learning Models with Ensemble Methods. Retrieved from https://taridwong.github.io/posts/2022-04-09-ensemble/

BibTeX citation

@misc{wongvorachan2022combining,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Combining Multiple Machine Learning Models with Ensemble Methods},
  url = {https://taridwong.github.io/posts/2022-04-09-ensemble/},
  year = {2022}
}