Quillot Mathias / Clustering

Blame view

bin/cluster_kmeans_mahalanobis.py 2.9 KB
  '''
  Un petit test pour faire du clustering
  avec une distance de mahalanobis
  From paper: 
  Convergence problems of Mahalanobis distance-based k-means clustering
  
  Just one thing: Column and lines are inversed in this script. 
  '''
  
  import matplotlib.pyplot as plt
  import numpy as np
  # from sklearn.manifold import TSNE
  
  
  N = 50  # Number of individus
  d = 2  # Number of dimensions
  K = 3  # number of clusters
  
  
  def initialize_model(X, number_clusters):
      C = X[np.random.choice(X.shape[0], number_clusters)]
      L = np.zeros((K, d, d))
      for k in range(K):
          L[k] = np.identity(d)
      return C, L
  
  
  X = np.random.rand(N, d)  # Features
  
  C, L = initialize_model(X, K)
  
  
  def dist(a, b, l):
      '''
      Distance euclidienne
      '''
      a = np.reshape(a, (-1, 1))
      b = np.reshape(b, (-1, 1))
      result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
      return result
  
  
  def plot_iteration(iteration, points, clusters, centers):
      fig = plt.figure()
      ax = fig.add_subplot(111)
      scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
      for i, j in centers:
          ax.scatter(i, j, s=50, c='red', marker='+')
      ax.set_xlabel('x')
      ax.set_ylabel('y')
      plt.colorbar(scatter)
      plt.ylim(0, 1)
      plt.xlim(0, 1)
      plt.savefig("test_" + str(iteration) + ".pdf")
  
  
  end_algo = False
  i = 0
  while not end_algo:
      if i == 10:
          exit(1)
      print("Iteration: ", i)
      # Calcul matrix distance
      distances = np.zeros((N, K))
  
      for n in range(N):
          for k in range(K):
              distances[n][k] = dist(X[n], C[k], L[k])
      print(distances)
      closest_cluster = np.argmin(distances, axis=1)
      if i % 1 == 0:
          # -- Debug tool ----------------------
          # TSNE
          X_embedded = np.concatenate((X, C), axis=0)
          # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
          # Then plot
          plot_iteration(
              i,
              X_embedded[:X.shape[0]],
              closest_cluster,
              X_embedded[X.shape[0]:]
              )
          # ------------------------------------
  
      end_algo = True
      for k in range(K):
          # Find subset of X with values closed to the centroid c_k.
          X_sub = np.where(closest_cluster == k)
          X_sub = np.take(X, X_sub[0], axis=0)
          np.mean(X_sub, axis=0)
          C_new = np.mean(X_sub, axis=0)
  
          # -- COMPUTE NEW LAMBDA (here named K) --
          K_new = np.zeros((L.shape[1], L.shape[2]))
          for x in X_sub:
              x = np.reshape(x, (-1, 1))
              c_tmp = np.reshape(C_new, (-1, 1))
              K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
          K_new = K_new / X_sub.shape[0]
          K_new = np.linalg.inv(K_new)
  
          if end_algo and (not (C[k] == C_new).all()):  # If the same stop
              end_algo = False
          C[k] = C_new
          L[k] = K_new
      i = i + 1
  
  plot_iteration(
      i,
      X_embedded[:X.shape[0]],
      closest_cluster,
      X_embedded[X.shape[0]:]
  )