From 5eb3a27646632055665ec5fde59add00dbe7547a Mon Sep 17 00:00:00 2001 From: Mathias Quillot Date: Wed, 11 Sep 2019 22:11:48 +0200 Subject: [PATCH] Implementation of kmeans with mahalanobis distance --- bin/cluster_kmeans_mahalanobis.py | 113 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 bin/cluster_kmeans_mahalanobis.py diff --git a/bin/cluster_kmeans_mahalanobis.py b/bin/cluster_kmeans_mahalanobis.py new file mode 100644 index 0000000..96e6921 --- /dev/null +++ b/bin/cluster_kmeans_mahalanobis.py @@ -0,0 +1,113 @@ +''' +Un petit test pour faire du clustering +avec une distance de mahalanobis +From paper: +Convergence problems of Mahalanobis distance-based k-means clustering + +Just one thing: Column and lines are inversed in this script. +''' + +import matplotlib.pyplot as plt +import numpy as np +# from sklearn.manifold import TSNE + + +N = 50 # Number of individus +d = 2 # Number of dimensions +K = 3 # number of clusters + + +def initialize_model(X, number_clusters): + C = X[np.random.choice(X.shape[0], number_clusters)] + L = np.zeros((K, d, d)) + for k in range(K): + L[k] = np.identity(d) + return C, L + + +X = np.random.rand(N, d) # Features + +C, L = initialize_model(X, K) + + +def dist(a, b, l): + ''' + Distance euclidienne + ''' + a = np.reshape(a, (-1, 1)) + b = np.reshape(b, (-1, 1)) + result = np.transpose(a - b).dot(l).dot(a-b)[0][0] + return result + + +def plot_iteration(iteration, points, clusters, centers): + fig = plt.figure() + ax = fig.add_subplot(111) + scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) + for i, j in centers: + ax.scatter(i, j, s=50, c='red', marker='+') + ax.set_xlabel('x') + ax.set_ylabel('y') + plt.colorbar(scatter) + plt.ylim(0, 1) + plt.xlim(0, 1) + plt.savefig("test_" + str(iteration) + ".pdf") + + +end_algo = False +i = 0 +while not end_algo: + if i == 10: + exit(1) + print("Iteration: ", i) + # Calcul matrix distance + distances = np.zeros((N, K)) + + for n in range(N): + for k in range(K): + distances[n][k] = dist(X[n], C[k], L[k]) + print(distances) + closest_cluster = np.argmin(distances, axis=1) + if i % 1 == 0: + # -- Debug tool ---------------------- + # TSNE + X_embedded = np.concatenate((X, C), axis=0) + # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) + # Then plot + plot_iteration( + i, + X_embedded[:X.shape[0]], + closest_cluster, + X_embedded[X.shape[0]:] + ) + # ------------------------------------ + + end_algo = True + for k in range(K): + # Find subset of X with values closed to the centroid c_k. + X_sub = np.where(closest_cluster == k) + X_sub = np.take(X, X_sub[0], axis=0) + np.mean(X_sub, axis=0) + C_new = np.mean(X_sub, axis=0) + + # -- COMPUTE NEW LAMBDA (here named K) -- + K_new = np.zeros((L.shape[1], L.shape[2])) + for x in X_sub: + x = np.reshape(x, (-1, 1)) + c_tmp = np.reshape(C_new, (-1, 1)) + K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) + K_new = K_new / X_sub.shape[0] + K_new = np.linalg.inv(K_new) + + if end_algo and (not (C[k] == C_new).all()): # If the same stop + end_algo = False + C[k] = C_new + L[k] = K_new + i = i + 1 + +plot_iteration( + i, + X_embedded[:X.shape[0]], + closest_cluster, + X_embedded[X.shape[0]:] +) -- 1.8.2.3