### Blame view

bin/cluster_kmeans_gaussianML.py 3.17 KB
 ```1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124``` ``` ''' Un petit test pour faire du clustering avec une distance de mahalanobis From paper: Convergence problems of Mahalanobis distance-based k-means clustering, Itshak Lapidot Just one thing: Column and lines are inversed in this script. TODO: Random selection from the set ''' import matplotlib.pyplot as plt import numpy as np # from sklearn.manifold import TSNE N = 50 # Number of individus d = 2 # Number of dimensions K = 3 # number of clusters def initialize_model(X, number_clusters): rand = np.random.choice(X.shape[0], number_clusters) print(rand) C = X[rand] L = np.zeros((K, d, d)) for k in range(K): L[k] = np.identity(d) return C, L X = np.random.rand(N, d) # Features C, L = initialize_model(X, K) def gaussian(x, c, l): ''' G function ''' p1 = np.power(np.linalg.det(l), 0.5) / np.power((2 * np.pi), d/2) p2 = np.exp(np.transpose(x - c).dot(1/2).dot(l).dot(x - c)) return (p1 * p2).reshape(1) def dist(a, b, l): ''' Distance euclidienne ''' a = np.reshape(a, (-1, 1)) b = np.reshape(b, (-1, 1)) return np.log(gaussian(a, b, l)) def plot_iteration(iteration, points, clusters, centers): fig = plt.figure() ax = fig.add_subplot(111) scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) for i, j in centers: ax.scatter(i, j, s=50, c='red', marker='+') ax.set_xlabel('x') ax.set_ylabel('y') plt.colorbar(scatter) plt.ylim(0, 1) plt.xlim(0, 1) plt.savefig("test_" + str(iteration) + ".pdf") end_algo = False i = 0 while not end_algo: if i == 10: exit(1) print("Iteration: ", i) # -- Calcul matrix distance distances = np.zeros((N, K)) # -- Calcul closest cluster for n in range(N): for k in range(K): distances[n][k] = dist(X[n], C[k], L[k]) closest_cluster = np.argmin(distances, axis=1) # -- Debug tool ---------------------- if i % 1 == 0: # TSNE X_embedded = np.concatenate((X, C), axis=0) # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) # Then plot plot_iteration( i, X_embedded[:X.shape[0]], closest_cluster, X_embedded[X.shape[0]:] ) # ------------------------------------ end_algo = True for k in range(K): # Find subset of X with values closed to the centroid c_k. X_sub = np.where(closest_cluster == k) X_sub = np.take(X, X_sub[0], axis=0) np.mean(X_sub, axis=0) C_new = np.mean(X_sub, axis=0) # -- COMPUTE NEW LAMBDA (here named K) -- K_new = np.zeros((L.shape[1], L.shape[2])) for x in X_sub: x = np.reshape(x, (-1, 1)) c_tmp = np.reshape(C_new, (-1, 1)) K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) K_new = K_new / X_sub.shape[0] K_new = np.linalg.inv(K_new) if end_algo and (not (C[k] == C_new).all()): # If the same stop end_algo = False C[k] = C_new L[k] = K_new i = i + 1 plot_iteration( i, X_embedded[:X.shape[0]], closest_cluster, X_embedded[X.shape[0]:] ) ```