Python - Dimension Reduction - Auto Encoder

Data:

Records of job applicants at the time they applied: GPA, GMAT score, work experience, and whether they were admitted (40 rows)

 

Mission:

An accuracy comparison experiment: Logistic Regression trained on Auto Encoder-compressed features versus K-Nearest Neighbours (KNN), Decision Tree, Naïve Bayes, and Support Vector Machine trained on the original features

 

Library used:

Pandas

Numpy

Matplotlib

Seaborn

Scikit-learn

Keras

 

Code:

import pandas as pd 

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

 

from sklearn.model_selection import train_test_split 

from sklearn.manifold import TSNE

from sklearn.preprocessing import MinMaxScaler 

from sklearn.metrics import accuracy_score

 

from keras.layers import Input, Dense

from keras.models import Model, Sequential

from keras import regularizers

 

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

 

url = 'https://raw.githubusercontent.com/kokocamp/vlog119/main/vlog119.csv'

vlog135 = pd.read_csv(url)

vlog135.describe()
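
A quick sanity check on the shape and column names is worthwhile before going further; the columns printed below are the ones the rest of the code assumes:

print(vlog135.shape)             # expected: (40, 4)
print(vlog135.columns.tolist())  # ['gpa', 'gmat', 'work_experience', 'admitted']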

 

X = vlog135[['gpa','gmat','work_experience']]

y = vlog135['admitted']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)

 

# Project the three features down to two dimensions for the scatter plot
tsne = TSNE(n_components = 2, random_state = 0)
X_tsne = tsne.fit_transform(X)

 

plt.scatter(X_tsne[np.where(y == 0), 0],
            X_tsne[np.where(y == 0), 1],
            marker = 'o', color = 'y', linewidth = 1,
            alpha = 0.8, label = 'Rejected')
plt.scatter(X_tsne[np.where(y == 1), 0],
            X_tsne[np.where(y == 1), 1],
            marker = 'o', color = 'k', linewidth = 1,
            alpha = 0.8, label = 'Admitted')
plt.legend()
plt.show()

 

X_skala = MinMaxScaler().fit_transform(X)

X_gagal_skala = X_skala[y == 0]

X_lulus_skala = X_skala[y == 1]

 

# Building the Input Layer

input_layer = Input(shape =(X.shape[1], ))

 

# Building the Encoder network

encoded = Dense(100, activation ='tanh',

                activity_regularizer = regularizers.l1(10e-5))(input_layer)

encoded = Dense(50, activation ='tanh',

                activity_regularizer = regularizers.l1(10e-5))(encoded)

encoded = Dense(25, activation ='tanh',

                activity_regularizer = regularizers.l1(10e-5))(encoded)

encoded = Dense(12, activation ='tanh',

                activity_regularizer = regularizers.l1(10e-5))(encoded)

encoded = Dense(6, activation ='relu')(encoded)

 

# Building the Decoder network

decoded = Dense(12, activation ='tanh')(encoded)

decoded = Dense(25, activation ='tanh')(decoded)

decoded = Dense(50, activation ='tanh')(decoded)

decoded = Dense(100, activation ='tanh')(decoded)

 

# Building the Output Layer

output_layer = Dense(X.shape[1], activation ='relu')(decoded)

 

# Defining the parameters of the Auto-encoder network

autoencoder = Model(input_layer, output_layer)

autoencoder.compile(optimizer ="adadelta", loss ="mse")

 

# Training the Auto-encoder to reconstruct the 'admitted' samples only
autoencoder.fit(X_lulus_skala, X_lulus_skala,
                batch_size = 16, epochs = 10,
                shuffle = True, validation_split = 0.25)

 

# Copy the encoder half (through the 6-unit bottleneck) into a standalone model
hidden_representation = Sequential()
for layer in autoencoder.layers[:6]:  # InputLayer plus the five encoder Dense layers
    hidden_representation.add(layer)
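
A cleaner alternative, if you prefer not to copy layers one by one: since this code already builds the network with the Keras functional API, the bottleneck tensor can be wrapped directly in its own Model. This is just a sketch of the equivalent construction:

# 'encoded' still refers to the 6-unit bottleneck tensor defined above
encoder = Model(input_layer, encoded)
# encoder.predict(...) yields the same 6-dimensional codes as hidden_representation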

 

# Separating the points encoded by the Auto-encoder into rejected and admitted

gagal_hidden_rep = hidden_representation.predict(X_gagal_skala)

lulus_hidden_rep = hidden_representation.predict(X_lulus_skala)

 

# Combining the encoded points into a single table

encoded_X = np.append(gagal_hidden_rep, lulus_hidden_rep, axis = 0)

y_gagal = np.zeros(gagal_hidden_rep.shape[0])

y_lulus = np.ones(lulus_hidden_rep.shape[0])

encoded_y = np.append(y_gagal, y_lulus)

 

# Plotting the encoded points with t-SNE

 

tsne = TSNE(n_components = 2, random_state = 0)

X_tsne = tsne.fit_transform(encoded_X)

 

plt.scatter(X_tsne[np.where(encoded_y == 0), 0],
            X_tsne[np.where(encoded_y == 0), 1],
            marker = 'o', color = 'y', linewidth = 1,
            alpha = 0.8, label = 'Rejected')
plt.scatter(X_tsne[np.where(encoded_y == 1), 0],
            X_tsne[np.where(encoded_y == 1), 1],
            marker = 'o', color = 'k', linewidth = 1,
            alpha = 0.8, label = 'Admitted')
plt.legend()
plt.show()

plt.legend()

plt.show()

 

# Splitting the encoded data for linear classification

X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(encoded_X, encoded_y, test_size = 0.25, random_state = 0)

 

# Splitting the original data for non-linear classification

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

 

# Building the logistic regression model

logistic_regression = LogisticRegression()

logistic_regression.fit(X_train_encoded, y_train_encoded)

 

# Storing the predictions of the linear model

y_pred_logistic_regression = logistic_regression.predict(X_test_encoded)

 

# Evaluating the performance of the linear model

print('Accuracy : '+str(accuracy_score(y_test_encoded, y_pred_logistic_regression)))

 

# Building the KNN model
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)

 

# Storing the predictions of the non-linear model

y_pred_knn = knn.predict(X_test)

 

# Evaluating the performance of the non-linear model

print('Accuracy : '+str(accuracy_score(y_test, y_pred_knn)))

 

# Building the Decision Tree model
d3 = DecisionTreeClassifier()

d3.fit(X_train, y_train)

 

# Storing the predictions of the non-linear model

y_pred_d3 = d3.predict(X_test)

 

# Evaluating the performance of the non-linear model

print('Accuracy : '+str(accuracy_score(y_test, y_pred_d3)))

 

# Building the Naive Bayes model
nb = GaussianNB()

nb.fit(X_train, y_train)

 

# Storing the predictions of the non-linear model

y_pred_nb = nb.predict(X_test)

 

# Evaluating the performance of the non-linear model

print('Accuracy : '+str(accuracy_score(y_test, y_pred_nb)))

 

# Building the SVM model

svm = SVC()

svm.fit(X_train, y_train)

 

# Storing the predictions of the non-linear model

y_pred_svm = svm.predict(X_test)

 

# Evaluating the performance of the non-linear model

print('Accuracy : '+str(accuracy_score(y_test, y_pred_svm)))
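
With only 40 rows, a single accuracy number can be noisy, so it helps to see all five scores side by side. A small sketch that reuses the predictions computed above:

# Collect the five accuracy scores into one comparison table
results = pd.DataFrame({
    'Model': ['Auto Encoder + Logistic Regression', 'KNN',
              'Decision Tree', 'Naive Bayes', 'SVM'],
    'Accuracy': [accuracy_score(y_test_encoded, y_pred_logistic_regression),
                 accuracy_score(y_test, y_pred_knn),
                 accuracy_score(y_test, y_pred_d3),
                 accuracy_score(y_test, y_pred_nb),
                 accuracy_score(y_test, y_pred_svm)]
})
print(results)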

I walked through this scenario in a YouTube video below.


 

Click this link (http://paparadit.blogspot.com/2020/11/the-algorithms-of-machine-learning.html) if you want to check out other algorithms. Thank you for visiting this blog and subscribing to my channel.


