Mini Project 6

Watchanan Chantapakul (wcgzm)


This miniproject has only one question, which is a bit longer than the questions in previous assignments; however, it also builds on the previous assignments. So, at the end of the day, you should be able to "borrow" a lot of your own code from before and finish this assignment quite easily. In this experiment, you are to compare several different classification approaches:

  1. using complete knowledge of the statistics of the data and computing optimum discriminant functions for the classification (Chapter 2);
  2. assuming that you know only the model of the distributions, but not their parameters (Chapter 3);
  3. by first reducing the dimensionality of the data set and then classifying it (Chapter 3);
  4. assuming that you do not know the underlying distributions and employing Parzen Window (Chapter 4);
  5. assuming that you do not know the underlying distributions and employing k-NN (Chapter 4);
  6. assuming that you do not know anything, but the class labels (Chapter 5).

Generate dataset and save it to 'data/mp6.npy' file

Initially, you must create four data sets according to the information below, but next week, you will be given the actual datasets with which you will write your final reports. The datasets that you will create are 'trivial', but they will be useful for debugging your programs. The ones that I will provide will be more challenging.

While using your datasets, the four testing/training data sets described here MUST be kept the same throughout all parts below. That is, you should create your data points ONCE, save them, and use them for all parts and subparts below. Do NOT create different data for each question or you may end up with very strange results!!

The four data sets will be referred to as: 1) Training Data I, with 50 samples in each class; 2) Training Data II, with 500 samples in each class; 3) Testing Data I, also with 500 samples in each class; and finally 4) Testing Data II, with 10,000 samples in each class.¹

All data sets must consist of 5-d points divided into 3 classes with the underlying normal distributions $p(\vec{x} | \omega_i) = \mathsf{N}(\vec{\mu}_i, \Sigma_i)$, where:

$$
\begin{align*}
\vec{\mu}_1 &= \begin{bmatrix}2 & 3 & 1 & 5.5 & 8.7\end{bmatrix}^{\mathsf{T}}\\
\vec{\mu}_2 &= \begin{bmatrix}-4.5 & 6 & -1 & 3 & 10\end{bmatrix}^{\mathsf{T}}\\
\vec{\mu}_3 &= \begin{bmatrix}1.2 & -2.3 & 1.5 & -0.5 & 2.7\end{bmatrix}^{\mathsf{T}}
\end{align*}
$$

$$
\mathbf{\Sigma}_1 = \begin{bmatrix}
1 & 0 & 0 & 0 & 0 \\
0 & 0.5 & 0 & 0 & 0 \\
0 & 0 & 2.5 & 0 & 0 \\
0 & 0 & 0 & 0.7 & 0 \\
0 & 0 & 0 & 0 & 3.5
\end{bmatrix}
\quad
\mathbf{\Sigma}_2 = \begin{bmatrix}
2 & 0 & 1 & 0.5 & 0 \\
0 & 3.5 & 0 & 0 & 0.6 \\
1 & 0 & 4.5 & 1.2 & 0 \\
0.5 & 0 & 1.2 & 1.6 & 0 \\
0 & 0.6 & 0 & 0 & 2.5
\end{bmatrix}
\quad
\mathbf{\Sigma}_3 = \begin{bmatrix}
4.2 & 0 & 1.3 & 2.5 & 1.4 \\
0 & 5 & 0 & 0 & 3.6 \\
1.3 & 0 & 4.5 & 4.2 & 0 \\
2.5 & 0 & 4.2 & 5.6 & 0 \\
1.4 & 3.6 & 0 & 0 & 7.5
\end{bmatrix}
$$

You will assume that all states of nature are equally probable.
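For reference, here is a minimal NumPy sketch of how such data sets could be generated and saved; the dictionary layout, variable names, and file path are assumptions for illustration, not necessarily the exact code used in this notebook.

```python
# Sketch of dataset generation (assumed layout; adjust to your own conventions).
import os
import numpy as np

rng = np.random.default_rng(0)  # fixed seed for reproducibility

mu = [np.array([2, 3, 1, 5.5, 8.7]),
      np.array([-4.5, 6, -1, 3, 10]),
      np.array([1.2, -2.3, 1.5, -0.5, 2.7])]
sigma = [np.diag([1, 0.5, 2.5, 0.7, 3.5]),
         np.array([[2, 0, 1, 0.5, 0],
                   [0, 3.5, 0, 0, 0.6],
                   [1, 0, 4.5, 1.2, 0],
                   [0.5, 0, 1.2, 1.6, 0],
                   [0, 0.6, 0, 0, 2.5]]),
         np.array([[4.2, 0, 1.3, 2.5, 1.4],
                   [0, 5, 0, 0, 3.6],
                   [1.3, 0, 4.5, 4.2, 0],
                   [2.5, 0, 4.2, 5.6, 0],
                   [1.4, 3.6, 0, 0, 7.5]])]

def make_set(n_per_class):
    """Draw n_per_class samples from each of the three Gaussians."""
    X = np.vstack([rng.multivariate_normal(m, s, n_per_class)
                   for m, s in zip(mu, sigma)])
    y = np.repeat(np.arange(3), n_per_class)
    return X, y

datasets = {'train1': make_set(50), 'train2': make_set(500),
            'test1': make_set(500), 'test2': make_set(10000)}
os.makedirs('data', exist_ok=True)
np.save('data/mp6.npy', datasets, allow_pickle=True)
```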


1 The sizes of the data sets above are not typical to real-life classification problems. The choices above were made solely to create interesting situations for discussion in your report.


* Comment out the code for generating the dataset (run once for the first time)

Load dataset from 'data/mp6.npy' file

Cross-check the defined mean vectors and covariance matrices with the question.

Confusion Matrix

All confusion matrices in this miniproject have the following structure:

|            | (predicted) 1 | (predicted) 2 | (predicted) 3 |
|------------|---------------|---------------|---------------|
| (actual) 1 | -             | -             | -             |
| (actual) 2 | -             | -             | -             |
| (actual) 3 | -             | -             | -             |
Table 1: Confusion matrix format with the actual classes in rows and the predicted classes in columns.
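A minimal sketch of how a confusion matrix in this format could be computed; the function name is illustrative.

```python
# Build a confusion matrix in the Table 1 format:
# rows = actual class, columns = predicted class.
import numpy as np

def confusion_matrix(y_true, y_pred, n_classes=3):
    cm = np.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true, y_pred):
        cm[t, p] += 1
    return cm

# Accuracy is the trace divided by the total number of samples:
# acc = np.trace(cm) / cm.sum()
```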

Part I

Here, you will first (in a and b) use complete knowledge about the data. Then (in c and d), you will "forget" that you know the means and covariances, and use ML to estimate them.

a) Classify the Testing Data I, using the given statistics above and the Bayes decision rule. Compute the confusion matrix.

We have a priori knowledge that the distributions are Gaussian. So, we can define the discriminant function for the normal density in the following generic form: $$ g_i(\vec{x}) = -\frac{1}{2}(\vec{x}-\vec{\mu}_i)^{\mathsf{T}}\mathbf{\Sigma}_i^{-1}(\vec{x}-\vec{\mu}_i)-\frac{d}{2}\ln (2\pi) - \frac{1}{2} \ln |\mathbf{\Sigma}_i| + \ln P(\omega_i) $$ for $d$-dimensional data with mean $\vec{\mu}_i$, covariance matrix $\mathbf{\Sigma}_i$, and prior probability $P(\omega_i)$ of class $i$.

Since every class has the same number of samples, the priors are equal and the term $\ln P(\omega_i)$ can be discarded. Also, since $d$ is a constant, we can drop the constant term $- \frac{d}{2} \ln (2\pi)$ as well.

We classify a sample $\vec{x}$, based on the Bayes decision rule, to be class $\omega_i$ if $g_i(\vec{x}) > g_j(\vec{x})$, $\forall j \neq i$. This can also be written in the form of argmax as follows: $$ \hat{\omega} = \underset{i}{\mathrm{argmax}} g_i(\vec{x}) $$
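A minimal sketch of this classifier, assuming the known means and covariances are given as lists of NumPy arrays; the constant term and the equal priors are omitted, as discussed above, and the names are illustrative.

```python
# Quadratic discriminant g_i(x) with known parameters and the argmax decision rule.
import numpy as np

def discriminants(X, means, covs):
    """Return an (n_samples, n_classes) matrix of g_i(x) values."""
    scores = []
    for m, S in zip(means, covs):
        S_inv = np.linalg.inv(S)
        diff = X - m                                   # (n, d)
        quad = np.einsum('nd,de,ne->n', diff, S_inv, diff)  # Mahalanobis term
        g = -0.5 * quad - 0.5 * np.log(np.linalg.det(S))
        scores.append(g)
    return np.column_stack(scores)

def bayes_classify(X, means, covs):
    # Assign each sample to the class with the largest discriminant value.
    return np.argmax(discriminants(X, means, covs), axis=1)
```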

b) Repeat part a) for the Testing Data II.

c) Compute the ML estimates ($\hat{\vec{\mu}}_i$ and $\hat{\Sigma}_i$) for each class using the Training Data I and classify the Testing Data II using Bayes decision rule. Compute the confusion matrix.

According to maximum likelihood estimation (MLE) for a Gaussian distribution, the estimated mean vector of class $\omega_i$ can be computed as $$ \hat{\vec{\mu}}_i = \frac{1}{n_i} \sum_{k=1}^{n_i} \vec{x}_k $$ where $n_i$ is the number of training samples in class $\omega_i$.

The estimated unbiased covariance matrix of class $\omega_i$ is given by: $$ \hat{\mathbf{\Sigma}}_i = \frac{1}{n_i-1} \sum_{k=1}^{n_i} (\vec{x}_k - \hat{\vec{\mu}}_i) (\vec{x}_k - \hat{\vec{\mu}}_i)^{\mathsf{T}} $$
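A minimal sketch of these ML estimates with NumPy; note that `np.cov` uses the unbiased $1/(n_i-1)$ normalization by default.

```python
# Per-class ML estimates of the mean and (unbiased) covariance.
import numpy as np

def mle_estimates(X, y, n_classes=3):
    means, covs = [], []
    for c in range(n_classes):
        Xc = X[y == c]
        mu_hat = Xc.mean(axis=0)               # sample mean of class c
        sigma_hat = np.cov(Xc, rowvar=False)   # unbiased covariance (1/(n-1))
        means.append(mu_hat)
        covs.append(sigma_hat)
    return means, covs
```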

d) Repeat part c), but this time use the Training Data II for the ML and then classify the Testing Data II again.

e) Make comments on each of the results above and then compare them (’compare’ does not mean to say “this was better than that”, but to say why that was the case).

| Part | Question | Algorithm | Training set | Accuracy on Testing Data I | Accuracy on Testing Data II |
|------|----------|-----------|--------------|----------------------------|-----------------------------|
| I | a, b | BDR | A priori knowledge | 99.87% | 99.89% |
| I | c | BDR | Training Data I | - | 99.81% |
| I | d | BDR | Training Data II | - | 99.88% |
Table 2: Part I classification results

First of all, we use the a priori knowledge about the data given in the instructions, i.e., we know exactly how each sample is drawn from its distribution. In this case, we do not need the training data at all, and the accuracies for questions (a) and (b) are therefore very high on both testing sets. For questions (c) and (d), instead of using the a priori knowledge, we estimate the parameters of the Gaussian distributions using maximum likelihood estimation (MLE). Since the estimated parameters are not the true parameters, the accuracy on the testing sets should drop. For question (c), in which we train on Training Data I (the smaller training set), the accuracy drops from 99.89% to 99.81%. If we instead train on Training Data II (more training samples), the estimated parameters are more accurate, which results in a higher accuracy of 99.88% (versus 99.81% from Training Data I). As expected, the accuracy does not exceed the 99.89% obtained with complete knowledge of the distributions.


Part II

In this part, you will reduce the dimensionality of the data by applying MDA. That is,

a) Using MDA on the Training Data I, find the matrix W such that $\vec{y}$ = $W\vec{x}$. (What is the expected dimension of $\vec{y}$?). For each class, compute the $\hat{\vec{\mu}}_i$ and $\Sigma_i$ of the new reduced-space variable $\vec{y}$.

The scatter matrix of class $i$ is defined by: $$ \mathbf{S}_i = \sum_{\vec{x} \in \mathcal{D}_i} (\vec{x} - \vec{m}_i)(\vec{x} - \vec{m}_i)^{\mathsf{T}} $$

The $d$-dimensional sample mean of class $i$ or $\vec{m}_i$ is given by: $$ \vec{m}_i = \frac{1}{n_i} \sum_{\vec{x} \in \mathcal{D}_i} \vec{x} $$ where $n_i$ is the number of training samples in class $i$.

The within-class scatter matrix is the sum of the scatter matrices of all classes: $$ \mathbf{S}_W = \sum_{i=1}^{C} \mathbf{S}_i $$

The between-class scatter matrix can be computed by: $$ \mathbf{S}_B = \sum_{i=1}^{C} n_i (\vec{m}_i - \vec{m}) (\vec{m}_i - \vec{m})^{\mathsf{T}} $$ where $\vec{m}$ is the total mean vector defined as: $$ \vec{m} = \frac{1}{n} \sum_{\vec{x}} \vec{x} = \frac{1}{n} \sum_{i=1}^{C} n_i \vec{m}_i $$

We then solve for the eigenvalues and eigenvectors of $\mathbf{S}_W^{-1} \mathbf{S}_B$: $$ \mathbf{S}_W^{-1} \mathbf{S}_B \mathbf{w} = \lambda \mathbf{w} $$

For the $C$-class classification problem, we select the eigenvectors corresponding to the largest $C-1$ non-zero eigenvalues as the columns of the weight matrix $\mathbf{W}$.

With the $d$-by-$(C-1)$ weight matrix $\mathbf{W}$, we can project from the $d$-dimensional space to a $(C-1)$-dimensional space by taking the dot product between a sample $\vec{x}$ and the weight matrix $\mathbf{W}$. In this case, the expected dimension of $\vec{y}$ after the projection is $C-1 = 2$ (since $C=3$ is the number of classes for this dataset).
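A minimal sketch of this MDA computation, assuming NumPy arrays `X` (samples) and `y` (labels); this is one possible implementation, not necessarily the exact code used in this notebook.

```python
# MDA: build S_W and S_B, solve the eigenproblem for S_W^{-1} S_B,
# and keep the eigenvectors of the largest C-1 eigenvalues as W.
import numpy as np

def mda_weights(X, y, n_classes=3):
    d = X.shape[1]
    m = X.mean(axis=0)                       # total mean vector
    S_W = np.zeros((d, d))
    S_B = np.zeros((d, d))
    for c in range(n_classes):
        Xc = X[y == c]
        mc = Xc.mean(axis=0)                 # class mean
        S_W += (Xc - mc).T @ (Xc - mc)       # class scatter matrix
        diff = (mc - m).reshape(-1, 1)
        S_B += len(Xc) * (diff @ diff.T)     # between-class contribution
    eigvals, eigvecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)
    order = np.argsort(eigvals.real)[::-1]   # sort by decreasing eigenvalue
    W = eigvecs[:, order[:n_classes - 1]].real   # d x (C-1)
    return W

# Projection: Y = X @ W   (equivalently y = W^T x for a single sample)
```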

This can also be verified numerically by taking the dot product between a sample $\vec{x}$ and the weight matrix $\mathbf{W}$ and checking the dimension of the result.

The mean vectors and covariance matrices are estimated just as in Part I, but on the transformed samples.

b) Apply the transformation W above to the Testing Data II and classify it using Bayes decision rule. Compute the confusion matrix.

The transformation is simply the matrix-vector product as follows: $$ \vec{y} = \mathbf{W}^{\mathsf{T}} \vec{x} $$

We then classify the transformed sample $\vec{y}$ using the discriminant function from Part I.

c) Repeat part a) above, but this time use the Training Data II to compute the matrix W and the $\hat{\vec{\mu}}_i$ and $\hat{\Sigma}_i$ for each class of the new reduced-space variable $\vec{y}$. Then classify the Testing Data II again. Compute the confusion matrix.

As in part a), the $d$-by-$(C-1)$ weight matrix $\mathbf{W}$ projects each sample into a $(C-1) = 2$ dimensional space, and the mean vectors and covariance matrices are re-estimated on the transformed samples, this time computed from the Training Data II.

d) Comment on the results from this Part II and then compare these results with the results from the previous Part I.

| Part | Question | Algorithm | Training set | Accuracy on Testing Data I | Accuracy on Testing Data II |
|------|----------|-----------|--------------|----------------------------|-----------------------------|
| I | a, b | BDR | A priori knowledge | 99.87% | 99.89% |
| I | c | BDR | Training Data I | - | 99.81% |
| I | d | BDR | Training Data II | - | 99.88% |
| II | b | MDA | Training Data I | - | 99.51% |
| II | c | MDA | Training Data II | - | 99.59% |
Table 3: Part II classification results

Training Data I has a smaller number of samples than Training Data II, which again makes the parameter estimation less accurate. The point of Part II is to apply multiple discriminant analysis (MDA), i.e., a projection from the $d$-dimensional space to a $(C-1)$-dimensional space, which reduces the dimensionality of the feature space. Of course, since the number of features is reduced from 5 to 2 in this case, we lose some accuracy. However, it is a decent trade-off, as the accuracies drop by less than 1% compared to Part I. With the reduced dimensionality, we can apply classification algorithms more efficiently, i.e., at a lower computational cost.


Part III

Now, you will completely forget that you know anything about any of the distributions and/or their parameters and apply a non-parametric approach to classification.

a) First, you will apply Parzen Window to each class in the Testing Data II using the Training Data I and a hypercube window function with $h_n = 0.7$. You must classify the Testing Data II according to the maximum posterior probability. Compute the confusion matrix. Repeat the classification for $h_n = 0.1$ and $h_n = 5$.

Based on this density estimation method, we can estimate an unknown probability density function without knowing its true form or its parameters. A $d$-dimensional hypercube $\mathcal{R}_n$ is the region of interest, since it lets us estimate the class-conditional density of class $i$ at a point $\vec{x}$, $p_n(\vec{x} | \omega_i)$, as: $$ p_n(\vec{x} | \omega_i) = \frac{k_n}{n V_n} $$ where $n$ is the number of training samples used for the estimate and $V_n$ is the volume of the hypercube $\mathcal{R}_n$. By the definition of the hypercube, the volume is simply $$ V_n = h_n^d $$ where $h_n$ is the length of an edge of $\mathcal{R}_n$.

We then define $k_n$, the number of samples that fall inside the $d$-dimensional hypercube $\mathcal{R}_n$ centered at $\vec{x}$, as: $$ k_n = \sum_{j=1}^{n} \varphi \left( \frac{\vec{x} - \vec{x}_j}{h_n} \right) $$

Combining the equations above, the density estimate becomes: $$ p_n(\vec{x} | \omega_i) = \frac{k_n}{n V_n} = \frac{1}{n V_n} \sum_{j=1}^{n} \varphi \left( \frac{\vec{x} - \vec{x}_j}{h_n} \right) $$

Based on the decision rule for the $C$-class classification problem, a sample $\vec{x}$ is assigned the predicted class $\hat{\omega}$ by: $$ \hat{\omega} = \underset{i}{\mathrm{argmax}}\ p_n(\vec{x} | \omega_i)P(\omega_i) $$ In this project the priors are equal, so we can discard the prior term and arrive at the decision rule in terms of class-conditional densities: $$ \hat{\omega} = \underset{i}{\mathrm{argmax}}\ p_n(\vec{x} | \omega_i) $$

$\varphi(\cdot)$ can be any kernel (window) function. The standard hypercube kernel is defined as: $$ \varphi \left(\frac{\vec{x} - \vec{x}_j}{h_n} \right) = \begin{cases} 1 & \text{if}\ |x_\ell - x_{j\ell}| \leq \frac{h_n}{2}\ \text{for every dimension}\ \ell = 1, \dots, d \\ 0 & \text{otherwise} \end{cases} $$
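A minimal sketch of the hypercube Parzen-window estimate and the resulting classifier, assuming one training array per class; the names are illustrative, not the exact code of this notebook.

```python
# Parzen-window density estimation with a hypercube kernel and the
# resulting maximum-density classifier (equal priors).
import numpy as np

def parzen_hypercube_density(x, X_train, h):
    """Estimate p_n(x | class) from the training samples of one class."""
    n, d = X_train.shape
    u = np.abs(x - X_train) / h            # (n, d) scaled coordinate distances
    inside = np.all(u <= 0.5, axis=1)      # a sample counts only if inside in every dimension
    k = inside.sum()
    return k / (n * h ** d)

def parzen_classify(X_test, class_train_sets, h):
    preds = []
    for x in X_test:
        densities = [parzen_hypercube_density(x, Xc, h) for Xc in class_train_sets]
        preds.append(int(np.argmax(densities)))
    return np.array(preds)
```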

b) Repeat part a), but this time use the Training Data II for the Parzen Window and classify the same Testing Data II with, again, $h_n = 0.1$, $h_n = 0.7$, and $h_n = 5$.

c) Repeat part b) using a Gaussian kernel (a Gaussian window function) with $\sigma = 0.1$. Then repeat this part with $\sigma = 0.7$ and $\sigma = 5$.

In this question, we replace the hypercube kernel with a Gaussian kernel, given by $$ \varphi \left(\frac{\vec{x} - \vec{x}_j}{\sigma} \right) = \frac{1}{(\sqrt{2\pi})^d V_n} \exp{\left(-\frac{1}{2}\left\|\frac{\vec{x} - \vec{x}_j}{\sigma}\right\|^2\right)} $$ where $V_n = \sigma^d$.
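A minimal sketch of a Gaussian-window estimate; the normalization used here follows the standard convention and may differ by a constant from the formula above, but since that constant is the same for every class it does not affect the argmax decision.

```python
# Gaussian-kernel Parzen-window density estimate for one class.
import numpy as np

def parzen_gaussian_density(x, X_train, sigma):
    n, d = X_train.shape
    sq_dist = np.sum((x - X_train) ** 2, axis=1) / sigma ** 2   # ||x - x_j||^2 / sigma^2
    phi = np.exp(-0.5 * sq_dist) / ((2 * np.pi) ** (d / 2) * sigma ** d)
    return phi.sum() / n
```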

d) Comment on the results above. Compare them with the results from the previous Parts above (I and II).

| Part | Question | Algorithm | Training set | Accuracy on Testing Data I | Accuracy on Testing Data II |
|------|----------|-----------|--------------|----------------------------|-----------------------------|
| I | a, b | BDR | A priori knowledge | 99.87% | 99.89% |
| I | c | BDR | Training Data I | - | 99.81% |
| I | d | BDR | Training Data II | - | 99.88% |
| II | b | MDA | Training Data I | - | 99.51% |
| II | c | MDA | Training Data II | - | 99.59% |
| III | a | Parzen Window (Hypercube $h_n=0.1$) | Training Data I | - | 33.33% |
| III | a | Parzen Window (Hypercube $h_n=0.7$) | Training Data I | - | 33.39% |
| III | a | Parzen Window (Hypercube $h_n=5.0$) | Training Data I | - | 94.86% |
| III | b | Parzen Window (Hypercube $h_n=0.1$) | Training Data II | - | 33.33% |
| III | b | Parzen Window (Hypercube $h_n=0.7$) | Training Data II | - | 34.19% |
| III | b | Parzen Window (Hypercube $h_n=5.0$) | Training Data II | - | 99.16% |
| III | c | Parzen Window (Gaussian $\sigma=0.1$) | Training Data II | - | 93.14% |
| III | c | Parzen Window (Gaussian $\sigma=0.7$) | Training Data II | - | 94.83% |
| III | c | Parzen Window (Gaussian $\sigma=5.0$) | Training Data II | - | 96.73% |
Table 4: Part III classification results

In the very first setting, we set the width of the hypercube window to $h_n = 0.1$, which is far too small: virtually no test samples fall inside the Parzen windows placed at the training samples. Because the implementation uses an argmax, the classifier then always predicts class $\omega_1$, which results in only 33.33% accuracy. This is the case whether we use Training Data I or Training Data II; the results are identical.

When we increase the width of the hypercube kernel to $h_n = 0.7$, the windows cover just a few test samples, so the accuracy improves only marginally (33.39% with Training Data I). With Training Data II the accuracy is 34.19%: since there are more training samples, a test sample has a slightly higher chance of falling inside one of the windows.

But when we set the width of the hypercube kernel to $h_n = 5.0$, the window is wide enough to cover the regions where samples actually fall, and the classifier performs well on the test set: up to 94.86% accuracy with Training Data I, and 99.16% with the larger Training Data II. Because the Parzen window is built from a finite number of training samples, a small training set leaves many holes at the fringes of the densities (alongside local spikes). In short, more training samples lead to better performance because there are fewer such holes.

Once we switch to the Gaussian kernel, even a standard deviation of $\sigma=0.1$ produces a very high accuracy of 93.14%, because the Gaussian kernel has infinite support (unlike the hypercube kernel with its crisp window). Even if a test sample is far away from the training samples, every training sample still contributes some (possibly tiny) value to $\varphi \left(\frac{\vec{x} - \vec{x}_j}{\sigma} \right)$, and the decision rule only needs the class with the maximum density, however small the values are. Increasing $\sigma$ to 0.7 and 5.0 gives better results, 94.83% and 96.73% respectively, because the wider windows make a test sample more likely to receive substantial weight from the correct class.

Compared with the previous classification methods, the Parzen window appears less accurate. However, we need to keep in mind that this is a non-parametric technique that requires little a priori knowledge; instead, we have to choose a kernel function and its hyperparameter ($h_n$ or $\sigma$). The biggest problem is that we need to tune or search for the best setting, e.g., we do not know in advance what the value of $h_n$ should be, and doing so is prone to overfitting and generalization issues; cross-validation should be used to find a good setting. On the flip side, MLE estimates $\vec{\mu}$ and $\Sigma$ of a Gaussian distribution, and the results from Part I are very high because the assumed distribution matches the true distribution; if the true distribution were not Gaussian and we did not know it, the picture would be different. The same holds for Part II, where MDA provides a dimensionality reduction followed by a BDR.

Also note that, with a large number of training samples, the Parzen window takes longer to run, since classifying each test sample requires going through all training samples.


Part IV

Once again, you will forget that you know anything about any of the distributions and/or their parameters and apply another non-parametric approach.

a) Using the Training Data I, you will classify the Testing Data II using k-Nearest-Neighbour. Use $k_n = \sqrt{n}$.

Recall that the Parzen window method requires us to choose a kernel (window) function and its size. If we do not have that knowledge, we can use the k-nearest-neighbor algorithm instead. Unlike the Parzen window, which fixes the volume $V_n$, k-NN fixes the number of nearest neighbors $k_n$: we grow a cell around a sample $\vec{x}$ until it covers $k_n$ training samples, of which $k_i$ are labeled as class $\omega_i$. The estimated a posteriori probability is then: $$ p_n(\omega_i | \vec{x}) = \frac{k_i}{k_n} $$

To find the nearest neighbors we use a distance function $D$. In this project, the Euclidean distance, i.e., the Minkowski distance with $p=2$ in $d$-dimensional space, is used: $$ D(\vec{a}, \vec{b}) = \left( \sum_{j=1}^{d} (a_j - b_j)^2 \right)^{1/2} $$

Based on the Bayes decision rule, we can classify a sample $\vec{x}$ to be a class $\hat{\omega}$ by computing: $$ \hat{\omega} = \underset{i}{\mathrm{argmax}} p_n(\omega_i | \vec{x}) = \underset{i}{\mathrm{argmax}} k_i $$ which means we classify a sample $\vec{x}$ based on the most frequent class within the grown cell that covers $k_n$ samples.

In this question, since the training data I has 50 samples for each class, this means we have 150 training samples. We can estimate $k_n = \sqrt{n} = \sqrt{150} \approx 12$.
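A minimal sketch of the $k_n$-NN classifier described above, using a brute-force search over all training samples; the names are illustrative.

```python
# k_n-NN classifier: find the k_n nearest training samples (Euclidean distance)
# and vote by the most frequent class label (argmax over k_i).
import numpy as np

def knn_classify(X_test, X_train, y_train, k_n):
    preds = []
    for x in X_test:
        dist = np.linalg.norm(X_train - x, axis=1)      # Euclidean distances
        nearest = np.argsort(dist)[:k_n]                # indices of the k_n nearest samples
        labels, counts = np.unique(y_train[nearest], return_counts=True)
        preds.append(labels[np.argmax(counts)])         # majority vote
    return np.array(preds)

# Part a): k_n = round(np.sqrt(150)) = 12 with Training Data I
```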

b) Repeat part a), but this time use the Training Data II to classify the Testing Data II.

In this question, since the training data II has 500 samples for each class, this means we have 1,500 training samples. We can estimate $k_n = \sqrt{n} = \sqrt{1500} \approx 39$.

c) Repeat part b) using your choice for $k_n = f(n)$.

In this question, we define the number of $k_n$ nearest neighbors to be: $$ k_n = f(n) = \frac{\sqrt{n}}{2} $$

Hence, substituting $n = 1{,}500$, we arrive at: $$ k_n = \frac{\sqrt{n}}{2} = \frac{\sqrt{1500}}{2} \approx 19 $$

d) Comment on the results above. Compare them with the results from the previous Parts above, especially Part III.

| Part | Question | Algorithm | Training set | Accuracy on Testing Data I | Accuracy on Testing Data II |
|------|----------|-----------|--------------|----------------------------|-----------------------------|
| I | a, b | BDR | A priori knowledge | 99.87% | 99.89% |
| I | c | BDR | Training Data I | - | 99.81% |
| I | d | BDR | Training Data II | - | 99.88% |
| II | b | MDA | Training Data I | - | 99.51% |
| II | c | MDA | Training Data II | - | 99.59% |
| III | a | Parzen Window (Hypercube $h_n=0.1$) | Training Data I | - | 33.33% |
| III | a | Parzen Window (Hypercube $h_n=0.7$) | Training Data I | - | 33.39% |
| III | a | Parzen Window (Hypercube $h_n=5.0$) | Training Data I | - | 94.86% |
| III | b | Parzen Window (Hypercube $h_n=0.1$) | Training Data II | - | 33.33% |
| III | b | Parzen Window (Hypercube $h_n=0.7$) | Training Data II | - | 34.19% |
| III | b | Parzen Window (Hypercube $h_n=5.0$) | Training Data II | - | 99.16% |
| III | c | Parzen Window (Gaussian $\sigma=0.1$) | Training Data II | - | 93.14% |
| III | c | Parzen Window (Gaussian $\sigma=0.7$) | Training Data II | - | 94.83% |
| III | c | Parzen Window (Gaussian $\sigma=5.0$) | Training Data II | - | 96.73% |
| IV | a | $k_n$-NN ($k_n = \sqrt{n}$) | Training Data I | - | 98.59% |
| IV | b | $k_n$-NN ($k_n = \sqrt{n}$) | Training Data II | - | 99.39% |
| IV | c | $k_n$-NN ($k_n = \frac{\sqrt{n}}{2}$) | Training Data II | - | 99.58% |
Table 5: Part IV classification results

First of all, we compute $k_n = \sqrt{n} = \sqrt{150} \approx 12$. As can be seen from the results above, $12$-NN yields 98.59% accuracy when we use Training Data I. The accuracy rises to 99.39% when we increase the number of training samples by using Training Data II, i.e., 1,500 samples and therefore $39$-NN. Once we change $k_n$ to 19 based on our own function ($k_n = \frac{\sqrt{n}}{2}$), the accuracy increases to 99.58%. All of these results depend on the choice of $k_n$, which requires empirical experiments to find the best value. But we need to be careful about overfitting and generalization, since a low training error does not guarantee a small test error; cross-validation can mitigate this issue.

Interestingly, if we compare the results from $k_n$-NN and the Parzen window, all of the $k_n$-NN results are better. This suggests that, when the window function and its hyperparameter are unknown, as they are here, $k_n$-NN is the better choice: it fixes the number of nearest neighbors $k_n$, as opposed to the Parzen window, which fixes the volume $V_n$ (and therefore requires choosing the window and its width). In short, $k_n$-NN provides better classification results without any underlying knowledge about the data (e.g., the underlying distributions), and its performance increases as the number of training samples grows. However, as with the Parzen window, a large number of training samples also means a longer run time, since every classification must go through all training samples.


Part V

Once again, you will forget that you know anything about any of the distributions and/or their parameters and apply another non-parametric approach. This time, repeat Part IV above using a linear classifier and the Perceptron criterion – for part a) and c), you obviously cannot pick a value for $k_n$, so, instead, use $\eta = \frac{1}{2}$ for part a) and then use $\eta = \frac{1}{\sqrt{k}}$ for part c), where $k$ is the iteration.

a) Using the Training Data I, you will classify the Testing Data II using the Perceptron criterion. Use $\eta = \frac{1}{2}$.

We classify a sample $\vec{x}$ based on the linear discriminant function $g_i(\vec{x})$ of class $\omega_i$ which is given by $$ g_i(\vec{x}) = \vec{a}_i^{\mathsf{T}}\vec{y} $$ where $\vec{a}$ is a weight vector, and $\vec{y}$ is a sample or feature vector.

Therefore, our decision rule is to assign $\vec{x}$ to class $\omega_i$ if $g_i(\vec{x}) > g_j(\vec{x})$, $\forall j \neq i$.

The Perceptron Criterion function is defined as: $$ J(\vec{a}) = \sum_{\vec{y} \in \mathcal{Y}} \left( - \vec{a}^{\mathsf{T}} \vec{y} \right) $$ where $\mathcal{Y}$ is the set of misclassified samples.

We can derive the gradient of $J$ w.r.t. the weight vector $\vec{a}$ at iteration $k$ by: $$ \nabla J(\vec{a}(k)) = \frac{\partial J(\vec{a}(k))}{\partial \vec{a}(k)} = \sum_{\vec{y} \in \mathcal{Y}} (- \vec{y}) $$

Since the question does not specify the way to initialize weight vectors, we randomly initialize $C$ weight vectors at time step 0 by drawing from the Gaussian distribution $\vec{a}(0) \sim \mathcal{N}(\vec{\mu},\,\mathbf{\Sigma})\,$ where $\vec{\mu} = \vec{0}$ and $\mathbf{\Sigma} = \mathbf{I}$.

Note that the random seed is fixed, so that the result is reproducible.

In terms of training the weight vectors, the update rule is employed based on the delta rule and the above gradient. $$ \begin{align*} \vec{a}(k+1) &= \vec{a}(k) - \eta(k) \nabla J(\vec{a}(k))\\ &= \vec{a}(k) - \eta(k) \sum_{\vec{y} \in \mathcal{Y}} (- \vec{y})\\ &= \vec{a}(k) + \eta(k) \sum_{\vec{y} \in \mathcal{Y}} \vec{y} \end{align*} $$

In this miniproject, the stopping criterion for training with the Perceptron criterion is a maximum number of iterations, $K=10$. This means we train the weights of the model for 10 iterations and then stop.

The learning rate $\eta$ is given by the question (a) to be $\eta = \frac{1}{2}$.

Let us try including the bias $w_0$ in the weight vector $\vec{a}$.

According to the linear discriminant function defined above, $g_i(\vec{x}) = \vec{a}_i^{\mathsf{T}}\vec{y}$, in order to include a bias term $w_0$ in the weights, our weight vector $\vec{a}$ is modified to be: $$ \vec{a} = \begin{bmatrix} w_0\\ w_1\\ w_2\\ w_3\\ w_4\\ w_5 \end{bmatrix} $$ and the feature vector becomes an augmented feature vector, given by:

$$ \vec{y} = \begin{bmatrix} 1\\ x_1\\ x_2\\ x_3\\ x_4\\ x_5 \end{bmatrix} = \begin{bmatrix} 1\\ \vec{x} \end{bmatrix} $$

Everything else is the same.
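A hedged sketch of one possible multiclass Perceptron implementation with augmented features: the batch update above is adapted here so that each misclassified sample pulls its true-class weight vector toward it and pushes the wrongly winning class away. This is an assumption about the implementation, not necessarily the exact code used in this notebook.

```python
# One plausible multiclass Perceptron-criterion trainer (assumed variant):
# each misclassified sample adds +eta*y to its true class and -eta*y to the
# wrongly winning class, for K iterations.
import numpy as np

def train_perceptron(X, t, n_classes=3, K=10, eta_fn=lambda k: 0.5, seed=0):
    rng = np.random.default_rng(seed)                 # fixed seed, reproducible
    Y = np.hstack([np.ones((len(X), 1)), X])          # augmented features [1, x]
    A = rng.standard_normal((n_classes, Y.shape[1]))  # a_i(0) ~ N(0, I)
    for k in range(1, K + 1):
        eta = eta_fn(k)
        scores = Y @ A.T                              # g_i(x) for every sample
        pred = np.argmax(scores, axis=1)
        wrong = pred != t                             # misclassified set
        for y_vec, ti, pi in zip(Y[wrong], t[wrong], pred[wrong]):
            A[ti] += eta * y_vec                      # pull toward true class
            A[pi] -= eta * y_vec                      # push away from wrong class
        if not wrong.any():                           # already separates the training data
            break
    return A

def perceptron_classify(X, A):
    Y = np.hstack([np.ones((len(X), 1)), X])
    return np.argmax(Y @ A.T, axis=1)

# Part c): pass eta_fn=lambda k: 1 / np.sqrt(k) for the decaying learning rate.
```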

b) Repeat part a), but this time use the Training Data II to classify the Testing Data II.

The learning rate $\eta$ is given by the question (b) to be $\eta = \frac{1}{2}$ (same as the question (a)).

c) Repeat part b) using $\eta = \frac{1}{\sqrt{k}}$.

The learning rate $\eta$ is given by the question (c) to be $\eta = \frac{1}{\sqrt{k}}$.

Let us try to change our only hyperparameter here, the maximum number of iterations $K \in \{10, 20, 40, 60, 80\}$.

Let us try varying the maximum number of iterations $K \in \{10, 20, 40, 60, 80\}$ for the models with the bias term.

d) Comment on the results above. Compare them with the results from the previous Parts above.

| Part | Question | Algorithm | Training set | Accuracy on Testing Data I | Accuracy on Testing Data II |
|------|----------|-----------|--------------|----------------------------|-----------------------------|
| I | a, b | BDR | A priori knowledge | 99.87% | 99.89% |
| I | c | BDR | Training Data I | - | 99.81% |
| I | d | BDR | Training Data II | - | 99.88% |
| II | b | MDA | Training Data I | - | 99.51% |
| II | c | MDA | Training Data II | - | 99.59% |
| III | a | Parzen Window (Hypercube $h_n=0.1$) | Training Data I | - | 33.33% |
| III | a | Parzen Window (Hypercube $h_n=0.7$) | Training Data I | - | 33.39% |
| III | a | Parzen Window (Hypercube $h_n=5.0$) | Training Data I | - | 94.86% |
| III | b | Parzen Window (Hypercube $h_n=0.1$) | Training Data II | - | 33.33% |
| III | b | Parzen Window (Hypercube $h_n=0.7$) | Training Data II | - | 34.19% |
| III | b | Parzen Window (Hypercube $h_n=5.0$) | Training Data II | - | 99.16% |
| III | c | Parzen Window (Gaussian $\sigma=0.1$) | Training Data II | - | 93.14% |
| III | c | Parzen Window (Gaussian $\sigma=0.7$) | Training Data II | - | 94.83% |
| III | c | Parzen Window (Gaussian $\sigma=5.0$) | Training Data II | - | 96.73% |
| IV | a | $k_n$-NN ($k_n = \sqrt{n}$) | Training Data I | - | 98.59% |
| IV | b | $k_n$-NN ($k_n = \sqrt{n}$) | Training Data II | - | 99.39% |
| IV | c | $k_n$-NN ($k_n = \frac{\sqrt{n}}{2}$) | Training Data II | - | 99.58% |
| V | a | Perceptron Criterion ($\eta=\frac{1}{2}$), $K=25$ | Training Data I | - | 97.98% |
| V | b | Perceptron Criterion ($\eta=\frac{1}{2}$), $K=25$ | Training Data II | - | 98.24% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=25$ | Training Data II | - | 98.22% |
| V | a | Perceptron Criterion ($\eta=\frac{1}{2}$), $K=25$, with bias | Training Data I | - | 98.31% |
| V | b | Perceptron Criterion ($\eta=\frac{1}{2}$), $K=25$, with bias | Training Data II | - | 98.36% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=25$, with bias | Training Data II | - | 98.37% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=10$ | Training Data II | - | 98.01% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=20$ | Training Data II | - | 98.23% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=40$ | Training Data II | - | 98.22% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=60$ | Training Data II | - | 98.25% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=80$ | Training Data II | - | 98.22% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=10$, with bias | Training Data II | - | 98.23% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=20$, with bias | Training Data II | - | 98.46% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=40$, with bias | Training Data II | - | 98.54% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=60$, with bias | Training Data II | - | 98.60% |
| V | c | Perceptron Criterion ($\eta(k)=\frac{1}{\sqrt{k}}$), $K=80$, with bias | Training Data II | - | 98.61% |
Table 6: Part V classification results

In Part V, again, we have no a priori knowledge. Although all we know are the training data and their class labels, we can train weight vectors $\vec{a}_i$ for the linear discriminant functions $g_i(\vec{x})$ by minimizing a criterion function, in this case the Perceptron criterion. According to the results in the table above, the models achieve very promising results without knowing anything else about the data, i.e., this is actually machine learning. We can also conclude that the problem is close to linearly separable, since we only use linear discriminant functions. The stopping criterion for every Perceptron run above is reaching the maximum number of iterations $K$.

We have tried a number of different values of $K$ and found that $K=25$ gives good classification results. It is also interesting to see how the bias term $w_0$ affects the models: with the same setting (the same $K$ and the same initial weights), the models with a bias term yield better results in all cases.

For question (c), we have a dynamic learning rate $\eta(k)$, a function of the current iteration: the larger the iteration $k$, the smaller the learning rate, i.e., the learning rate decreases over time. To see how the dynamic learning rate affects the performance, we varied $K \in \{10, 20, 40, 60, 80\}$. As can be seen from the table above, training a model longer tends to increase the accuracy. Nevertheless, one must be careful: training a model too long may eventually overfit it to the training data. There are several possible remedies, for instance adding regularization or applying cross-validation.


All in all, this is a wonderful MiniProject that wraps up pretty much everything in this course. Different supervised classification techniques, with different knowledge and assumptions, are employed in a series to see how each one works, and the comparisons between them are discussed. And this brings the Introduction to Pattern Recognition and Machine Learning class to an end.