cluster_kmeans_gaussianML.py
3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
'''
A small experiment in k-means clustering using a Mahalanobis distance.

From the paper:
    Convergence problems of Mahalanobis distance-based k-means clustering,
    Itshak Lapidot

NOTE: rows and columns are swapped in this script relative to the paper's
notation (points are stored one per row).
TODO: Random selection from the set
'''
import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE
N = 50 # Number of individuals (data points)
d = 2 # Number of dimensions (features per point)
K = 3 # Number of clusters
def initialize_model(X, number_clusters):
    '''
    Pick the initial centroids and precision matrices for k-means.

    Parameters
    ----------
    X : (N, d) array — data points, one per row.
    number_clusters : int — number of clusters K.

    Returns
    -------
    C : (K, d) array — K *distinct* rows of X used as initial centroids.
    L : (K, d, d) array — per-cluster precision matrices, initialised to
        the identity (i.e. plain Euclidean distance on the first iteration).
    '''
    n_points, n_dims = X.shape
    # replace=False guarantees distinct starting points; the default
    # (with replacement) could seed two clusters at the same centroid.
    rand = np.random.choice(n_points, number_clusters, replace=False)
    C = X[rand]
    L = np.zeros((number_clusters, n_dims, n_dims))
    for k in range(number_clusters):
        L[k] = np.identity(n_dims)
    return C, L
X = np.random.rand(N, d) # Features: N random points in the unit square
C, L = initialize_model(X, K) # Initial centroids and precision matrices
def gaussian(x, c, l):
    '''
    G function: multivariate Gaussian density with mean ``c`` and
    *precision* (inverse covariance) matrix ``l``:

        G(x) = |l|^(1/2) / (2*pi)^(d/2) * exp(-(x - c)^T l (x - c) / 2)

    ``x`` and ``c`` are (d, 1) column vectors; returns a shape-(1,) array.
    '''
    dims = x.shape[0]
    norm_const = np.power(np.linalg.det(l), 0.5) / np.power(2 * np.pi, dims / 2)
    # The exponent must be NEGATIVE: the previous version dropped the minus
    # sign, making the "density" grow with the Mahalanobis distance.
    expo = np.exp(-0.5 * np.transpose(x - c).dot(l).dot(x - c))
    return (norm_const * expo).reshape(1)
def dist(a, b, l):
    '''
    Mahalanobis-based distance between ``a`` and ``b`` under precision ``l``,
    defined as -log G(a; b, l) (cf. Lapidot's paper).  Negating the
    log-likelihood makes the value increase with the Mahalanobis distance,
    so taking the argmin over clusters selects the most likely one.
    '''
    a = np.reshape(a, (-1, 1))
    b = np.reshape(b, (-1, 1))
    return -np.log(gaussian(a, b, l))
def plot_iteration(iteration, points, clusters, centers):
    '''
    Scatter-plot the 2-D points coloured by cluster assignment, mark each
    centroid with a red cross, and save the figure to test_<iteration>.pdf.
    '''
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
    for i, j in centers:
        ax.scatter(i, j, s=50, c='red', marker='+')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(scatter)
    plt.ylim(0, 1)
    plt.xlim(0, 1)
    plt.savefig("test_" + str(iteration) + ".pdf")
    # Close the figure: this is called once per iteration and matplotlib
    # keeps every open figure alive, leaking memory as the loop progresses.
    plt.close(fig)
# -- Main Mahalanobis k-means loop (Lapidot's algorithm) --------------------
MAX_ITERATIONS = 10  # safety cap: this variant is not guaranteed to converge
end_algo = False
i = 0
while not end_algo:
    if i == MAX_ITERATIONS:
        # Stop cleanly and still produce the final plot, instead of
        # aborting the whole process with exit(1).
        break
    print("Iteration: ", i)
    # -- Compute the N x K distance matrix --
    distances = np.zeros((N, K))
    for n in range(N):
        for k in range(K):
            distances[n][k] = dist(X[n], C[k], L[k])
    # -- Assign each point to its closest cluster --
    closest_cluster = np.argmin(distances, axis=1)
    # -- Debug tool ----------------------
    # Plot every iteration. The data is already 2-D, so TSNE is disabled and
    # the "embedding" is just points stacked on top of the centroids.
    X_embedded = np.concatenate((X, C), axis=0)
    # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
    plot_iteration(
        i,
        X_embedded[:X.shape[0]],
        closest_cluster,
        X_embedded[X.shape[0]:]
    )
    # ------------------------------------
    end_algo = True
    for k in range(K):
        # Subset of X assigned to centroid c_k.
        members = np.where(closest_cluster == k)[0]
        if members.size == 0:
            # Empty cluster: keep the previous centroid and precision rather
            # than producing a NaN mean and a singular scatter matrix.
            continue
        X_sub = np.take(X, members, axis=0)
        C_new = np.mean(X_sub, axis=0)
        # -- Compute the new LAMBDA (precision), accumulated in K_new --
        K_new = np.zeros((L.shape[1], L.shape[2]))
        c_tmp = np.reshape(C_new, (-1, 1))
        for x in X_sub:
            x = np.reshape(x, (-1, 1))
            K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
        K_new = K_new / X_sub.shape[0]
        # Convergence check: keep iterating while any centroid still moves.
        if end_algo and (not (C[k] == C_new).all()):
            end_algo = False
        C[k] = C_new
        try:
            L[k] = np.linalg.inv(K_new)
        except np.linalg.LinAlgError:
            # Singular scatter (e.g. a single-point cluster):
            # keep the previous precision matrix.
            pass
    i = i + 1
plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)