Commit cce036f22feef37d6c05c4b0757afbcbc15de45c

Authored by Mathias Quillot
1 parent a79344696d
Exists in master

Implementation of basic kmeans made by my hand

Showing 1 changed file with 85 additions and 0 deletions Side-by-side Diff

bin/cluster_kmeans_ownmade.py
  1 +'''
  2 +A small test for doing clustering
  3 +with a Mahalanobis distance.
  4 +(Note: the distance currently implemented below is the squared Euclidean one.)
  5 +'''
  5 +
  6 +import matplotlib.pyplot as plt
  7 +import numpy as np
  8 +from sklearn.manifold import TSNE
  9 +
# Problem size.
N = 18  # number of samples (individuals)
d = 2  # number of feature dimensions
K = 3  # number of clusters

# Features: N points drawn uniformly from [0, 1) in each of the d dimensions.
X = np.random.random_sample((N, d))

# Initial model ("model 0"): K centroids drawn uniformly from [0, 1) as well.
C = np.random.random_sample(size=(K, d))
  17 +
  18 +
def dist(a, b):
    '''
    Squared Euclidean distance between ``a`` and ``b``.

    The square root is never taken: the result is the sum of the squared
    absolute coordinate differences.
    '''
    delta = np.abs(a - b)
    return np.sum(delta ** 2)
  24 +
  25 +
def plot_iteration(iteration, points, clusters, centers):
    '''
    Save a scatter plot of one clustering iteration to a PDF file.

    The figure is written to ``test_<iteration>.pdf`` in the current working
    directory and then closed, so repeated calls do not accumulate open
    figures.

    Parameters:
        iteration: value appended to the output file name.
        points: array indexed as ``points[:, 0]`` / ``points[:, 1]``; only the
            first two columns are plotted.
        clusters: per-point values used to colour the points (``c=`` argument).
        centers: iterable of ``(x, y)`` pairs, each drawn as a red ``+``.
    '''
    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
        for center_x, center_y in centers:
            ax.scatter(center_x, center_y, s=50, c='red', marker='+')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        # Address the figure/axes explicitly instead of relying on pyplot's
        # implicit "current figure" state.
        fig.colorbar(scatter, ax=ax)
        ax.set_ylim(0, 1)
        ax.set_xlim(0, 1)
        fig.savefig("test_" + str(iteration) + ".pdf")
    finally:
        # Figures created through pyplot stay alive until closed explicitly;
        # without this, one figure leaks per iteration.
        plt.close(fig)
  38 +
  39 +
# k-means main loop: alternate between assigning every sample to its closest
# centroid and moving every centroid to the mean of its assigned samples,
# until no centroid moves any more.
end_algo = False
i = 0
while not end_algo:
    if i == 2000:
        # Safety net: give up with a non-zero exit status instead of looping
        # forever. SystemExit is raised directly because the ``exit`` helper
        # is only injected by the ``site`` module.
        raise SystemExit(1)
    print("Iteration: ", i)

    # Assignment step: distance from every sample to every centroid.
    distances = np.zeros((N, K))
    for n in range(N):
        for k in range(K):
            distances[n, k] = dist(X[n], C[k])
    closest_cluster = np.argmin(distances, axis=1)

    # -- Debug tool ----------------------
    # Plot the current state at every iteration. The data is plotted as-is;
    # a projection such as
    # TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
    # could be used here instead of the plain concatenation.
    X_embedded = np.concatenate((X, C), axis=0)
    plot_iteration(
        i,
        X_embedded[:X.shape[0]],
        closest_cluster,
        X_embedded[X.shape[0]:]
    )
    # ------------------------------------

    # Update step: the algorithm stops once an iteration leaves every
    # centroid unchanged.
    end_algo = True
    for k in range(K):
        # Samples currently assigned to centroid k.
        members = X[closest_cluster == k]
        if members.shape[0] == 0:
            # Empty cluster: the mean of zero samples is NaN, which would
            # never compare equal to itself (so the loop could not converge)
            # and would attract every sample at the next argmin. Keep the
            # previous centroid instead.
            continue
        C_new = np.mean(members, axis=0)
        if not np.array_equal(C[k], C_new):
            end_algo = False
        C[k] = C_new
    i = i + 1

# Final plot, saved under the iteration counter's value after the loop.
plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)