Commit 57883c9873c990fa6dbb023f775c4b7ea54a9db9

Authored by Mathias Quillot
1 parent 5eb3a27646
Exists in master

Implementation of kmeans with gaussian

Showing 1 changed file with 124 additions and 0 deletions.

bin/cluster_kmeans_gaussianML.py
'''
A small clustering experiment using a Mahalanobis-based distance.

From the paper:
"Convergence problems of Mahalanobis distance-based k-means clustering",
Itshak Lapidot

Note: columns and rows are swapped in this script relative to the paper.
TODO: Random selection from the set
'''
11
import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE

N = 50  # Number of individuals (data points)
d = 2   # Number of dimensions (features per point)
K = 3   # Number of clusters
19
20
def initialize_model(X, number_clusters):
    '''
    Initialize the clustering model.

    Picks `number_clusters` *distinct* rows of X as the initial centroids
    and pairs each with an identity precision matrix.

    :param X: (n_samples, n_features) data matrix.
    :param number_clusters: number of centroids to draw.
    :return: (C, L) where C is (number_clusters, n_features) centroids and
             L is (number_clusters, n_features, n_features) precision matrices.
    '''
    n_samples, n_features = X.shape
    # replace=False: without it the same point could be drawn twice,
    # yielding two identical centroids (fixes the original behavior,
    # which also ignored `number_clusters` in favor of the global K).
    chosen = np.random.choice(n_samples, number_clusters, replace=False)
    C = X[chosen]
    L = np.zeros((number_clusters, n_features, n_features))
    for k in range(number_clusters):
        L[k] = np.identity(n_features)
    return C, L
29
30
# Synthetic data: N points drawn uniformly in the unit square [0, 1)^d.
X = np.random.rand(N, d)  # Features
C, L = initialize_model(X, K)  # Initial centroids C and precision matrices L
33
34
def gaussian(x, c, l):
    '''
    Multivariate Gaussian density with mean c and precision matrix l,
    evaluated at x.

    N(x) = det(l)^0.5 / (2*pi)^(dim/2) * exp(-1/2 (x-c)^T l (x-c))

    Fix vs. original: the exponent was missing its minus sign, so the
    "density" grew with the Mahalanobis distance instead of decaying.
    The dimensionality is now taken from x itself rather than the
    global `d`.

    :param x: (dim, 1) column vector.
    :param c: (dim, 1) column vector (the mean / centroid).
    :param l: (dim, dim) precision (inverse covariance) matrix.
    :return: ndarray of shape (1,) holding the density value.
    '''
    dim = x.shape[0]
    norm = np.power(np.linalg.det(l), 0.5) / np.power(2 * np.pi, dim / 2)
    quad = float(np.transpose(x - c).dot(l).dot(x - c))
    return np.array([norm * np.exp(-0.5 * quad)])


def dist(a, b, l):
    '''
    Distance between point a and centroid b under precision matrix l,
    defined as the negative log-likelihood -log N(a; b, l).

    Fix vs. original: returned +log(gaussian), which (with the corrected
    density) is *largest* for the closest point; a distance must be
    smallest there, hence the negation. (The old docstring also wrongly
    called this a Euclidean distance.)

    :param a: point, any shape reshapeable to (dim, 1).
    :param b: centroid, any shape reshapeable to (dim, 1).
    :param l: (dim, dim) precision matrix.
    :return: ndarray of shape (1,) holding the distance.
    '''
    a = np.reshape(a, (-1, 1))
    b = np.reshape(b, (-1, 1))
    return -np.log(gaussian(a, b, l))
51
52
def plot_iteration(iteration, points, clusters, centers):
    '''
    Scatter-plot the points colored by cluster assignment, mark the
    centroids with red crosses, and save the figure to
    "test_<iteration>.pdf".

    :param iteration: iteration index, used in the output file name.
    :param points: (n, 2) array of 2-D points.
    :param clusters: length-n array of cluster labels (color values).
    :param centers: iterable of (x, y) centroid coordinates.
    '''
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
    for cx, cy in centers:
        ax.scatter(cx, cy, s=50, c='red', marker='+')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(scatter)
    plt.ylim(0, 1)
    plt.xlim(0, 1)
    plt.savefig("test_" + str(iteration) + ".pdf")
    # Fix: release the figure — the original leaked one figure per call,
    # and matplotlib warns/accumulates memory after ~20 open figures.
    plt.close(fig)
65
66
# ---------------------------------------------------------------------------
# Mahalanobis k-means main loop: alternate assignment and update steps until
# the centroids stop moving, or a hard iteration cap is reached (the paper
# shows this scheme is not guaranteed to converge).
# ---------------------------------------------------------------------------
MAX_ITERATIONS = 10

end_algo = False
i = 0
while not end_algo:
    if i == MAX_ITERATIONS:
        # Fix: stop cleanly instead of exit(1) — hitting the cap is not an
        # error, and exit(1) also skipped the final plot below.
        break
    print("Iteration: ", i)

    # -- Assignment step: distance from every point to every centroid.
    distances = np.zeros((N, K))
    for n in range(N):
        for k in range(K):
            distances[n][k] = dist(X[n], C[k], L[k])
    closest_cluster = np.argmin(distances, axis=1)

    # -- Debug tool ----------------------
    if i % 1 == 0:
        # TSNE
        X_embedded = np.concatenate((X, C), axis=0)
        # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
        # Then plot
        plot_iteration(
            i,
            X_embedded[:X.shape[0]],
            closest_cluster,
            X_embedded[X.shape[0]:]
        )
    # ------------------------------------

    # -- Update step: recompute each centroid and its precision matrix.
    end_algo = True
    for k in range(K):
        # Subset of X assigned to centroid c_k.
        members = np.where(closest_cluster == k)
        X_sub = np.take(X, members[0], axis=0)
        # Fix: an empty cluster made np.mean return NaN and np.linalg.inv
        # fail; keep the previous centroid/precision instead.
        # NOTE(review): a single-point cluster still yields a singular
        # covariance and raises in np.linalg.inv — TODO handle if it occurs.
        if X_sub.shape[0] == 0:
            continue
        C_new = np.mean(X_sub, axis=0)

        # -- Compute the new precision matrix Lambda (named K in the paper):
        # inverse of the cluster's sample covariance.
        K_new = np.zeros((L.shape[1], L.shape[2]))
        for x in X_sub:
            x = np.reshape(x, (-1, 1))
            c_tmp = np.reshape(C_new, (-1, 1))
            K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
        K_new = K_new / X_sub.shape[0]
        K_new = np.linalg.inv(K_new)

        # Exact float comparison: any centroid movement forces another pass.
        if end_algo and (not (C[k] == C_new).all()):
            end_algo = False
        C[k] = C_new
        L[k] = K_new
    i = i + 1

# Final state (uses the last computed embedding/assignment).
plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)
125