Addressing Data Imbalance with Semi-Supervised Learning

For this post, I will use a semi-supervised learning approach to perform a classification task on highly imbalanced data.

Tarid Wongvorachan (University of Alberta)


Image from No copyright infringement is intended
Data Set Preparation

from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns
# Load the transactions table from the working directory.
data = pd.read_csv("creditcard.csv")
# Divide Time by 3600 and wrap it modulo 24 in one vectorized step
# (an hour-of-day value, assuming Time is recorded in seconds -- TODO confirm).
data["Time"] = data["Time"].div(3600).mod(24)
       Time        V1        V2        V3  ...       V27       V28  Amount  Class
0  0.000000 -1.359807 -0.072781  2.536347  ...  0.133558 -0.021053  149.62      0
1  0.000000  1.191857  0.266151  0.166480  ... -0.008983  0.014724    2.69      0
2  0.000278 -1.358354 -1.340163  1.773209  ... -0.055353 -0.059752  378.66      0
3  0.000278 -0.966272 -0.185226  1.792993  ...  0.062723  0.061458  123.50      0
4  0.000556 -1.158233  0.877737  1.548718  ...  0.219422  0.215153   69.99      0

[5 rows x 31 columns]
# Tabulate how many rows fall in each class and their share of the data set.
# Naming the index ("Target") and the value column ("Count") explicitly keeps
# the result identical across pandas versions: since pandas 2.0,
# value_counts() names its columns "Class"/"count" instead of "index"/"Class",
# which would make the old column lookups silently pick up the labels
# instead of the counts.
vc = data["Class"].value_counts().rename_axis("Target").reset_index(name="Count")
# Share of each class in percent, rounded to two decimals.
vc["percent"] = (100 * vc["Count"] / len(data)).round(2)
   Target   Count  percent
0       0  284315    99.83
1       1     492     0.17
# Undersample the majority class: 1000 randomly drawn Class == 0 rows
# (no seed is given, so the draw differs between runs) plus every
# Class == 1 row.
non_fraud = data[data['Class'] == 0].sample(1000)
fraud = data[data['Class'] == 1]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the two frames. sample(frac=1) then shuffles all rows and
# reset_index(drop=True) discards the original row labels.
df = pd.concat([non_fraud, fraud]).sample(frac=1).reset_index(drop=True)
# Feature matrix (every column except the label) and label vector.
X = df.drop(['Class'], axis = 1).values
Y = df["Class"].values

Visualize Fraud and Non-Fraud Transactions

# Embed the undersampled feature matrix in two dimensions with t-SNE.
# random_state is fixed so the embedding is repeatable for a given X.
tsne = TSNE(n_components=2, random_state=0)

TSNE_result = tsne.fit_transform(X)

# One point per transaction, coloured by its class label. x and y are passed
# as keywords because seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=TSNE_result[:, 0], y=TSNE_result[:, 1], hue=Y, legend='full', palette="hls")


Image from No copyright infringement is intended
- First, we will create a network with one input layer and one output layer. Both of them will have identical dimensions.

# Autoencoder whose input and output widths both equal the number of
# feature columns in X.
n_features = X.shape[1]

# Input
input_layer = Input(shape=(n_features,))

# Encoder: 100 tanh units (with an L1 penalty on their activations),
# then a 50-unit relu layer.
sparsity_penalty = regularizers.l1(10e-5)
encoded = Dense(100, activation="tanh", activity_regularizer=sparsity_penalty)(input_layer)
encoded = Dense(50, activation="relu")(encoded)

# Decoder: mirrors the encoder widths (50 -> 100), both tanh.
decoded = Dense(50, activation="tanh")(encoded)
decoded = Dense(100, activation="tanh")(decoded)

# Output: back to the original feature width.
output_layer = Dense(n_features, activation="relu")(decoded)

# Wire input to output and train against mean squared reconstruction error.
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(loss="mse", optimizer="adadelta")
# Split the full data set into features and labels.
x = data.drop(["Class"], axis=1)
y = data["Class"].values

# Rescale every feature with min-max scaling, then separate the rows by
# class label.
x_scale = preprocessing.MinMaxScaler().fit_transform(x.values)
x_norm, x_fraud = x_scale[y == 0], x_scale[y == 1]

# Train the autoencoder to reconstruct non-fraud rows only (input == target).
# The opening line of this call was missing from the file, which left the
# keyword arguments below dangling as a syntax error.
# NOTE(review): the 2000-row training slice is a reconstruction -- confirm
# against the original notebook.
autoencoder.fit(x_norm[0:2000], x_norm[0:2000],
                batch_size = 256, epochs = 10, 
                shuffle = True, validation_split = 0.20)

Obtain the Generated Data

# Build the encoder half of the trained autoencoder. The model was left
# empty in the file, so predict() had nothing to run; the first three layers
# of `autoencoder` (input layer plus the two encoding Dense layers) are
# reused here so the trained weights are shared.
hidden_representation = Sequential()
for layer in autoencoder.layers[:3]:
    hidden_representation.add(layer)

# Encode the first 3000 non-fraud rows and every fraud row.
norm_hid_rep = hidden_representation.predict(x_norm[:3000])
fraud_hid_rep = hidden_representation.predict(x_fraud)

# Stack the encoded rows and build matching labels: 0 = non-fraud, 1 = fraud.
new_x = np.append(norm_hid_rep, fraud_hid_rep, axis = 0)
new_y_not_fraud = np.zeros(norm_hid_rep.shape[0])
new_y_fraud = np.ones(fraud_hid_rep.shape[0])
new_y = np.append(new_y_not_fraud, new_y_fraud)

# Re-run t-SNE on the encoded features and plot them coloured by label.
# x and y are passed as keywords because seaborn >= 0.12 no longer accepts
# them positionally.
TSNE_result_new = tsne.fit_transform(new_x)
sns.scatterplot(x=TSNE_result_new[:, 0], y=TSNE_result_new[:, 1], hue=new_y, legend='full', palette="hls")

Linear Classifier

# Hold out 30% of the encoded rows for evaluation (no seed is given, so the
# split differs between runs).
X_train, X_test, y_train, y_test = train_test_split(new_x, new_y, test_size=0.3)

# Fit a logistic regression on the encoded features. The constructor and the
# fit call had been fused into one malformed line; they are separated again.
clf_lr = LogisticRegression(max_iter = 450, random_state = 123)
clf_lr.fit(X_train, y_train)
LogisticRegression(max_iter=450, random_state=123)
# Predict the held-out rows and report the fraction classified correctly,
# formatted to four decimals.
pred_lr = clf_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, pred_lr)
print(f"Accuracy: {accuracy_lr:0.4f}")
Accuracy: 0.9781
# Per-class precision, recall and F1 on the held-out rows. The report was
# built but never displayed; print it so the table actually appears.
report_lr = classification_report(y_test, pred_lr)
print(report_lr)
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       894
         1.0       1.00      0.85      0.92       154

    accuracy                           0.98      1048
   macro avg       0.99      0.93      0.95      1048
weighted avg       0.98      0.98      0.98      1048
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, pred_lr)
# The matrix is transposed before plotting so that true labels run along the
# x-axis and predicted labels along the y-axis, matching the axis titles.
ax = sns.heatmap(conf_matrix.T, square=True, annot=True, fmt='d', cbar=False)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
ax.set_title("Confusion matrix of Logistic Regression")


