Commit 57883c9873c990fa6dbb023f775c4b7ea54a9db9

Authored by Mathias Quillot
1 parent 5eb3a27646
Exists in master

Implementation of kmeans with gaussian

Showing 1 changed file with 124 additions and 0 deletions Side-by-side Diff

bin/cluster_kmeans_gaussianML.py
  1 +'''
  2 +Un petit test pour faire du clustering
  3 +avec une distance de mahalanobis
  4 +From paper:
  5 +Convergence problems of Mahalanobis distance-based k-means clustering,
  6 +Itshak Lapidot
  7 +
  8 +Just one thing: Column and lines are inversed in this script.
  9 +TODO: Random selection from the set
  10 +'''
  11 +
  12 +import matplotlib.pyplot as plt
  13 +import numpy as np
  14 +# from sklearn.manifold import TSNE
  15 +
  16 +N = 50 # Number of individus
  17 +d = 2 # Number of dimensions
  18 +K = 3 # number of clusters
  19 +
  20 +
  21 +def initialize_model(X, number_clusters):
  22 + rand = np.random.choice(X.shape[0], number_clusters)
  23 + print(rand)
  24 + C = X[rand]
  25 + L = np.zeros((K, d, d))
  26 + for k in range(K):
  27 + L[k] = np.identity(d)
  28 + return C, L
  29 +
  30 +
  31 +X = np.random.rand(N, d) # Features
  32 +C, L = initialize_model(X, K)
  33 +
  34 +
  35 +def gaussian(x, c, l):
  36 + '''
  37 + G function
  38 + '''
  39 + p1 = np.power(np.linalg.det(l), 0.5) / np.power((2 * np.pi), d/2)
  40 + p2 = np.exp(np.transpose(x - c).dot(1/2).dot(l).dot(x - c))
  41 + return (p1 * p2).reshape(1)
  42 +
  43 +
  44 +def dist(a, b, l):
  45 + '''
  46 + Distance euclidienne
  47 + '''
  48 + a = np.reshape(a, (-1, 1))
  49 + b = np.reshape(b, (-1, 1))
  50 + return np.log(gaussian(a, b, l))
  51 +
  52 +
  53 +def plot_iteration(iteration, points, clusters, centers):
  54 + fig = plt.figure()
  55 + ax = fig.add_subplot(111)
  56 + scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
  57 + for i, j in centers:
  58 + ax.scatter(i, j, s=50, c='red', marker='+')
  59 + ax.set_xlabel('x')
  60 + ax.set_ylabel('y')
  61 + plt.colorbar(scatter)
  62 + plt.ylim(0, 1)
  63 + plt.xlim(0, 1)
  64 + plt.savefig("test_" + str(iteration) + ".pdf")
  65 +
  66 +
  67 +end_algo = False
  68 +i = 0
  69 +while not end_algo:
  70 + if i == 10:
  71 + exit(1)
  72 + print("Iteration: ", i)
  73 + # -- Calcul matrix distance
  74 + distances = np.zeros((N, K))
  75 +
  76 + # -- Calcul closest cluster
  77 + for n in range(N):
  78 + for k in range(K):
  79 + distances[n][k] = dist(X[n], C[k], L[k])
  80 + closest_cluster = np.argmin(distances, axis=1)
  81 +
  82 + # -- Debug tool ----------------------
  83 + if i % 1 == 0:
  84 + # TSNE
  85 + X_embedded = np.concatenate((X, C), axis=0)
  86 + # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
  87 + # Then plot
  88 + plot_iteration(
  89 + i,
  90 + X_embedded[:X.shape[0]],
  91 + closest_cluster,
  92 + X_embedded[X.shape[0]:]
  93 + )
  94 + # ------------------------------------
  95 +
  96 + end_algo = True
  97 + for k in range(K):
  98 + # Find subset of X with values closed to the centroid c_k.
  99 + X_sub = np.where(closest_cluster == k)
  100 + X_sub = np.take(X, X_sub[0], axis=0)
  101 + np.mean(X_sub, axis=0)
  102 + C_new = np.mean(X_sub, axis=0)
  103 +
  104 + # -- COMPUTE NEW LAMBDA (here named K) --
  105 + K_new = np.zeros((L.shape[1], L.shape[2]))
  106 + for x in X_sub:
  107 + x = np.reshape(x, (-1, 1))
  108 + c_tmp = np.reshape(C_new, (-1, 1))
  109 + K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
  110 + K_new = K_new / X_sub.shape[0]
  111 + K_new = np.linalg.inv(K_new)
  112 +
  113 + if end_algo and (not (C[k] == C_new).all()): # If the same stop
  114 + end_algo = False
  115 + C[k] = C_new
  116 + L[k] = K_new
  117 + i = i + 1
  118 +
  119 +plot_iteration(
  120 + i,
  121 + X_embedded[:X.shape[0]],
  122 + closest_cluster,
  123 + X_embedded[X.shape[0]:]
  124 +)