# Provenance: recovered from git patch cce036f22feef37d6c05c4b0757afbcbc15de45c
# ("[PATCH] Implementation of basic kmeans made by my hand", Mathias Quillot,
# Wed, 11 Sep 2019 22:11:18 +0200), which creates bin/cluster_kmeans_ownmade.py.
"""Small hand-written k-means clustering experiment.

The stated goal of this script is clustering with a Mahalanobis distance;
the implementation below uses the squared Euclidean distance.

Run as a script, it clusters ``N`` random points of the unit square into
``K`` groups and saves one ``test_<iteration>.pdf`` scatter plot per plotted
iteration, plus a final one once the centroids have stopped moving.
"""

import sys

import numpy as np

N = 18  # Number of individuals (samples)
d = 2  # Number of dimensions
K = 3  # Number of clusters

MAX_ITERATIONS = 2000  # Give up (exit status 1) when not converged by then
PLOT_EVERY = 1  # Debug tool: save a plot every PLOT_EVERY iterations

X = np.random.rand(N, d)  # Features
C = np.random.random_sample((K, d))  # Model 0 (initial centroids)


class ConvergenceError(RuntimeError):
    """Raised when k-means has not converged within the iteration budget."""


def dist(a, b, axis=None):
    """Return the squared Euclidean distance between *a* and *b*.

    The square root is not taken; it would not change which centroid is the
    closest one, which is all the clustering needs.

    Args:
        a: Array-like of coordinates.
        b: Array-like of coordinates, broadcastable against *a*.
        axis: Axis holding the coordinates. With the default ``None`` every
            element is summed, giving a single scalar for two vectors.

    Returns:
        The sum of squared coordinate differences along *axis*.
    """
    diff = np.asarray(a) - np.asarray(b)
    return np.sum(np.abs(diff) ** 2, axis=axis)


def plot_iteration(iteration, points, clusters, centers):
    """Save a scatter plot of one clustering state.

    Points are coloured by cluster index, centroids are drawn as red ``+``
    markers, and both axes are fixed to ``[0, 1]``. Only the first two
    coordinates are drawn, so data with more than two dimensions would need
    a 2-D projection (e.g. t-SNE) beforehand.

    Args:
        iteration: Iteration number, used in the output file name.
        points: Array of shape (n_points, >= 2).
        clusters: Cluster index of each point, used as the colour value.
        centers: Array of shape (n_clusters, >= 2).

    Side effects:
        Writes ``test_<iteration>.pdf`` in the current working directory.
    """
    # Imported lazily so the clustering code stays importable on machines
    # without the plotting stack.
    import matplotlib.pyplot as plt

    points = np.asarray(points)
    centers = np.asarray(centers)

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
        if centers.size:
            ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        fig.colorbar(scatter, ax=ax)
        ax.set_ylim(0, 1)
        ax.set_xlim(0, 1)
        fig.savefig("test_" + str(iteration) + ".pdf")
    finally:
        # One figure is created per call; without closing it, every
        # iteration would leave a figure alive until the process exits.
        plt.close(fig)


def assign_clusters(points, centers):
    """Return, for each point, the index of its closest centroid.

    Args:
        points: Array of shape (n_points, n_dims).
        centers: Array of shape (n_clusters, n_dims).

    Returns:
        Integer array of shape (n_points,).
    """
    points = np.asarray(points)
    centers = np.asarray(centers)
    # (n_points, 1, n_dims) against (1, n_clusters, n_dims) broadcasts to
    # every point/centroid pair; summing the last axis gives the
    # (n_points, n_clusters) distance matrix.
    distances = dist(points[:, np.newaxis, :], centers[np.newaxis, :, :], axis=-1)
    return np.argmin(distances, axis=1)


def update_centers(points, labels, centers):
    """Return the centroids recomputed as the mean of their members.

    A cluster without any member keeps its previous centroid. The mean of an
    empty selection is NaN, and a NaN centroid never compares equal to
    itself, so the stopping test could otherwise never succeed.

    Args:
        points: Array of shape (n_points, n_dims).
        labels: Cluster index of each point.
        centers: Current centroids, shape (n_clusters, n_dims). Not modified.

    Returns:
        A new float array with the same shape as *centers*.
    """
    points = np.asarray(points)
    labels = np.asarray(labels)
    new_centers = np.array(centers, dtype=float)  # always a copy
    for k in range(len(new_centers)):
        members = points[labels == k]
        if len(members):
            new_centers[k] = members.mean(axis=0)
    return new_centers


def kmeans(points, centers, max_iterations=MAX_ITERATIONS, on_iteration=None):
    """Alternate assignment and centroid update until the centroids are stable.

    Args:
        points: Array of shape (n_points, n_dims).
        centers: Initial centroids, shape (n_clusters, n_dims). Not modified.
        max_iterations: Maximum number of assignment/update rounds.
        on_iteration: Optional callable invoked once per round as
            ``on_iteration(iteration, points, labels, centers)`` with the
            centroids the assignment was computed from.

    Returns:
        A ``(centers, labels, iterations)`` tuple: the final centroids, the
        cluster index of each point, and the number of rounds performed.

    Raises:
        ConvergenceError: If the centroids still move after
            *max_iterations* rounds.
    """
    points = np.asarray(points, dtype=float)
    centers = np.array(centers, dtype=float)

    for iteration in range(max_iterations):
        labels = assign_clusters(points, centers)
        if on_iteration is not None:
            on_iteration(iteration, points, labels, centers)

        new_centers = update_centers(points, labels, centers)
        # Exact comparison is intentional: an unchanged assignment yields
        # bit-identical means, which is the fixed point being detected.
        converged = np.array_equal(new_centers, centers)
        centers = new_centers
        if converged:
            return centers, labels, iteration + 1

    raise ConvergenceError(
        "k-means did not converge after " + str(max_iterations) + " iterations"
    )


def main():
    """Cluster the random sample ``X`` from the initial model ``C``."""

    def report(iteration, points, labels, centers):
        print("Iteration: ", iteration)
        if iteration % PLOT_EVERY == 0:
            plot_iteration(iteration, points, labels, centers)

    try:
        centers, labels, iterations = kmeans(X, C, on_iteration=report)
    except ConvergenceError as err:
        print(err, file=sys.stderr)
        sys.exit(1)

    # Final state, numbered after the last iteration.
    plot_iteration(iterations, X, labels, centers)


if __name__ == "__main__":
    main()