Commit 57883c9873c990fa6dbb023f775c4b7ea54a9db9
1 parent 5eb3a27646
Exists in master
Implementation of kmeans with gaussian
Showing 1 changed file with 124 additions and 0 deletions
bin/cluster_kmeans_gaussianML.py
File was created

'''
A small test of k-means clustering
with a Mahalanobis distance.
From the paper:
"Convergence problems of Mahalanobis distance-based k-means clustering",
Itshak Lapidot

One caveat: columns and rows are swapped in this script.
TODO: random selection from the set
'''

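# Overview of the iteration implemented below (one possible reading of the
# paper's algorithm): each point x_n is assigned to the cluster k that
# minimises d(x_n, c_k) = -log G(x_n; c_k, L_k), where
#   G(x; c, L) = |L|^(1/2) / (2*pi)^(d/2) * exp(-1/2 * (x - c)^T L (x - c))
# and L_k is the precision (inverse covariance) matrix of cluster k. Each
# centroid c_k is then re-estimated as the mean of its assigned points, and
# L_k as the inverse of their covariance matrix.
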
import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE

N = 50  # Number of data points
d = 2   # Number of dimensions
K = 3   # Number of clusters


def initialize_model(X, number_clusters):
    '''
    Pick number_clusters random rows of X as initial centroids and
    initialise each precision matrix to the identity.
    '''
    rand = np.random.choice(X.shape[0], number_clusters)
    print(rand)
    C = X[rand]
    L = np.zeros((number_clusters, X.shape[1], X.shape[1]))
    for k in range(number_clusters):
        L[k] = np.identity(X.shape[1])
    return C, L


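# With the settings above (N = 50, d = 2, K = 3) the call below returns
# C with shape (3, 2), one randomly chosen data point per centroid, and
# L with shape (3, 2, 2), one identity precision matrix per cluster.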
X = np.random.rand(N, d)  # Features
C, L = initialize_model(X, K)


def gaussian(x, c, l):
    '''
    G function: Gaussian density with mean c and precision matrix l,
    evaluated at x (both given as d x 1 column vectors).
    '''
    p1 = np.power(np.linalg.det(l), 0.5) / np.power((2 * np.pi), d / 2)
    p2 = np.exp(-0.5 * np.transpose(x - c).dot(l).dot(x - c))
    return (p1 * p2).item()

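# Quick sanity check on gaussian() (an addition for illustration, safe to
# delete): at x == c with an identity precision matrix the density reduces
# to 1 / (2*pi)^(d/2).
_x0 = np.zeros((d, 1))
assert np.isclose(gaussian(_x0, _x0, np.identity(d)), 1.0 / np.power(2 * np.pi, d / 2))
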

def dist(a, b, l):
    '''
    Gaussian-based distance: the negative log-likelihood of a under a
    Gaussian centred at b with precision matrix l.  Up to the
    log-determinant term this is half the Mahalanobis distance
    (a - b)^T l (a - b).
    '''
    a = np.reshape(a, (-1, 1))
    b = np.reshape(b, (-1, 1))
    return -np.log(gaussian(a, b, l))


def plot_iteration(iteration, points, clusters, centers):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
    for i, j in centers:
        ax.scatter(i, j, s=50, c='red', marker='+')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(scatter)
    plt.ylim(0, 1)
    plt.xlim(0, 1)
    plt.savefig("test_" + str(iteration) + ".pdf")


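# Main loop: stop either when no centroid moves between two iterations
# or after a hard cap of 10 iterations.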
end_algo = False
i = 0
while not end_algo:
    if i == 10:
        break
    print("Iteration: ", i)
    # -- Compute the distance matrix
    distances = np.zeros((N, K))

    # -- Find the closest cluster for each point
    for n in range(N):
        for k in range(K):
            distances[n][k] = dist(X[n], C[k], L[k])
    closest_cluster = np.argmin(distances, axis=1)

    # -- Debug tool ----------------------
    if i % 1 == 0:
        # TSNE
        X_embedded = np.concatenate((X, C), axis=0)
        # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
        # Then plot
        plot_iteration(
            i,
            X_embedded[:X.shape[0]],
            closest_cluster,
            X_embedded[X.shape[0]:]
        )
    # ------------------------------------

    end_algo = True
    for k in range(K):
        # Find the subset of X assigned to centroid c_k.
        X_sub = np.where(closest_cluster == k)
        X_sub = np.take(X, X_sub[0], axis=0)
        C_new = np.mean(X_sub, axis=0)

        # -- Compute the new precision matrix Lambda_k (named L_new here) --
        # Lambda_k is the inverse of the within-cluster covariance matrix.
        # NOTE: an empty cluster would make this divide by zero (see the TODO above).
        L_new = np.zeros((L.shape[1], L.shape[2]))
        for x in X_sub:
            x = np.reshape(x, (-1, 1))
            c_tmp = np.reshape(C_new, (-1, 1))
            L_new = L_new + (x - c_tmp).dot((x - c_tmp).transpose())
        L_new = L_new / X_sub.shape[0]
        L_new = np.linalg.inv(L_new)

        if end_algo and (not (C[k] == C_new).all()):  # Stop only if no centroid moved
            end_algo = False
        C[k] = C_new
        L[k] = L_new
    i = i + 1

plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)
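
# Each call to plot_iteration writes a file test_<iteration>.pdf in the
# current working directory, so a run leaves one scatter plot per iteration
# plus a final one for the converged clustering.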