cluster_kmeans_mahalanobis.py
2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
'''
Un petit test pour faire du clustering
avec une distance de mahalanobis
From paper:
Convergence problems of Mahalanobis distance-based k-means clustering
Just one thing: Column and lines are inversed in this script.
'''
import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE
N = 50 # Number of individus
d = 2 # Number of dimensions
K = 3 # number of clusters
def initialize_model(X, number_clusters):
C = X[np.random.choice(X.shape[0], number_clusters)]
L = np.zeros((K, d, d))
for k in range(K):
L[k] = np.identity(d)
return C, L
X = np.random.rand(N, d) # Features
C, L = initialize_model(X, K)
def dist(a, b, l):
'''
Distance euclidienne
'''
a = np.reshape(a, (-1, 1))
b = np.reshape(b, (-1, 1))
result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
return result
def plot_iteration(iteration, points, clusters, centers):
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
for i, j in centers:
ax.scatter(i, j, s=50, c='red', marker='+')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.colorbar(scatter)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.savefig("test_" + str(iteration) + ".pdf")
end_algo = False
i = 0
while not end_algo:
if i == 10:
exit(1)
print("Iteration: ", i)
# Calcul matrix distance
distances = np.zeros((N, K))
for n in range(N):
for k in range(K):
distances[n][k] = dist(X[n], C[k], L[k])
print(distances)
closest_cluster = np.argmin(distances, axis=1)
if i % 1 == 0:
# -- Debug tool ----------------------
# TSNE
X_embedded = np.concatenate((X, C), axis=0)
# X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
# Then plot
plot_iteration(
i,
X_embedded[:X.shape[0]],
closest_cluster,
X_embedded[X.shape[0]:]
)
# ------------------------------------
end_algo = True
for k in range(K):
# Find subset of X with values closed to the centroid c_k.
X_sub = np.where(closest_cluster == k)
X_sub = np.take(X, X_sub[0], axis=0)
np.mean(X_sub, axis=0)
C_new = np.mean(X_sub, axis=0)
# -- COMPUTE NEW LAMBDA (here named K) --
K_new = np.zeros((L.shape[1], L.shape[2]))
for x in X_sub:
x = np.reshape(x, (-1, 1))
c_tmp = np.reshape(C_new, (-1, 1))
K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
K_new = K_new / X_sub.shape[0]
K_new = np.linalg.inv(K_new)
if end_algo and (not (C[k] == C_new).all()): # If the same stop
end_algo = False
C[k] = C_new
L[k] = K_new
i = i + 1
plot_iteration(
i,
X_embedded[:X.shape[0]],
closest_cluster,
X_embedded[X.shape[0]:]
)