'''
A small test of clustering with a Mahalanobis distance.
From the paper:
"Convergence problems of Mahalanobis distance-based k-means clustering",
Itshak Lapidot

Note: rows and columns are swapped in this script relative to the paper.
TODO: Random selection from the set
'''
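
# The model implemented below: each cluster k has a centroid c_k and a
# precision matrix Lambda_k, and a point x is scored with the Gaussian
#     G_k(x) = |Lambda_k|^(1/2) / (2*pi)^(d/2) * exp(-1/2 (x - c_k)^T Lambda_k (x - c_k))
# The assignment step minimizes the negative log-likelihood
#     d(x, c_k) = -log G_k(x)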

import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE

N = 50  # number of samples
d = 2  # number of dimensions
K = 3  # number of clusters


def initialize_model(X, number_clusters):
    '''Pick random rows of X as initial centroids; start with identity precisions.'''
    rand = np.random.choice(X.shape[0], number_clusters, replace=False)
    print(rand)
    C = X[rand]
    L = np.zeros((number_clusters, X.shape[1], X.shape[1]))
    for k in range(number_clusters):
        L[k] = np.identity(X.shape[1])
    return C, L


X = np.random.rand(N, d)  # Features
C, L = initialize_model(X, K)


def gaussian(x, c, l):
    '''
    Gaussian density with mean c and precision matrix l (the inverse covariance):
    G(x) = |l|^(1/2) / (2*pi)^(d/2) * exp(-1/2 * (x - c)^T l (x - c))
    '''
    p1 = np.power(np.linalg.det(l), 0.5) / np.power(2 * np.pi, d / 2)
    p2 = np.exp(-0.5 * np.transpose(x - c).dot(l).dot(x - c))
    return (p1 * p2).item()
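
# Sanity check (mine, not from the paper): with l = identity and x == c,
# the density reduces to 1 / (2*pi)^(d/2).
assert np.isclose(
    gaussian(np.zeros((d, 1)), np.zeros((d, 1)), np.identity(d)),
    (2 * np.pi) ** (-d / 2),
)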


def dist(a, b, l):
    '''
    Distance from a point to a centroid: the negative Gaussian log-likelihood
    (not Euclidean), so a smaller value means a more likely cluster.
    '''
    a = np.reshape(a, (-1, 1))
    b = np.reshape(b, (-1, 1))
    return -np.log(gaussian(a, b, l))
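
# Sanity check (mine): with l = identity, dist is half the squared Euclidean
# distance plus a constant, so the argmin over clusters matches plain k-means.
_a, _b = np.zeros(d), np.ones(d)
assert np.isclose(
    dist(_a, _b, np.identity(d)),
    d / 2 * np.log(2 * np.pi) + np.sum((_a - _b) ** 2) / 2,
)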


def plot_iteration(iteration, points, clusters, centers):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
    for cx, cy in centers:
        ax.scatter(cx, cy, s=50, c='red', marker='+')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(scatter)
    plt.ylim(0, 1)
    plt.xlim(0, 1)
    plt.savefig("test_" + str(iteration) + ".pdf")
    plt.close(fig)  # avoid accumulating open figures across iterations


end_algo = False
i = 0
while not end_algo:
    if i == 10:  # safety cap on the number of iterations
        break
    print("Iteration: ", i)
    # -- Compute the distance matrix
    distances = np.zeros((N, K))

    # -- Find the closest cluster for each point
    for n in range(N):
        for k in range(K):
            distances[n][k] = dist(X[n], C[k], L[k])
    closest_cluster = np.argmin(distances, axis=1)
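
    # An equivalent vectorized assignment step (a sketch, not in the original):
    # diff = X[:, None, :] - C[None, :, :]                # (N, K, d)
    # quad = np.einsum('nkd,kde,nke->nk', diff, L, diff)  # quadratic forms
    # logdet = np.linalg.slogdet(L)[1]                    # (K,) log-determinants
    # distances = d / 2 * np.log(2 * np.pi) - logdet / 2 + quad / 2
    # closest_cluster = np.argmin(distances, axis=1)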

    # -- Debug tool ----------------------
    if i % 1 == 0:  # plot every iteration (raise the modulus to plot less often)
        # Optional t-SNE embedding before plotting (unneeded here since d == 2)
        X_embedded = np.concatenate((X, C), axis=0)
        # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
        # Then plot
        plot_iteration(
            i,
            X_embedded[:X.shape[0]],
            closest_cluster,
            X_embedded[X.shape[0]:]
            )
    # ------------------------------------

    end_algo = True
    for k in range(K):
        # Subset of X assigned to centroid c_k
        X_sub = np.where(closest_cluster == k)
        X_sub = np.take(X, X_sub[0], axis=0)
        if X_sub.shape[0] == 0:
            continue  # empty cluster: keep the previous centroid and precision
        C_new = np.mean(X_sub, axis=0)

        # -- Compute the new precision Lambda_k: the inverse of the cluster
        # sample covariance Sigma_k = 1/|X_k| * sum_x (x - c_k)(x - c_k)^T
        L_new = np.zeros((L.shape[1], L.shape[2]))
        for x in X_sub:
            x = np.reshape(x, (-1, 1))
            c_tmp = np.reshape(C_new, (-1, 1))
            L_new = L_new + (x - c_tmp).dot((x - c_tmp).transpose())
        L_new = L_new / X_sub.shape[0]
        # Note: this inversion fails if a cluster holds fewer than d + 1 points
        # (singular covariance); a small ridge term would make it robust.
        L_new = np.linalg.inv(L_new)

        if end_algo and (not (C[k] == C_new).all()):  # a centroid moved: iterate again
            end_algo = False
        C[k] = C_new
        L[k] = L_new
    i = i + 1

plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)