Blame view

bin/cluster_kmeans_constrained_mahalanobis.py 3.25 KB
a79344696   Mathias Quillot   Implementation of...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
'''
A small experiment: k-means clustering
with a Mahalanobis distance.
From paper:
Convergence problems of Mahalanobis distance-based k-means clustering,
Itshak Lapidot

Just one thing: columns and lines are inverted in this script.
TODO: Random selection from the set
'''

import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE

N = 50  # Number of individuals (data points)
d = 2  # Number of dimensions (features per point)
K = 3  # number of clusters
  
  def initialize_model(X, number_clusters):
      rand = np.random.choice(X.shape[0], number_clusters)
      C = X[rand]
      L = np.zeros((K, d, d))
      for k in range(K):
          L[k] = np.identity(d)
      return C, L
  
  
# Draw N random d-dimensional points in [0, 1) and initialise the model:
# C holds the K centroids, L the per-cluster Lambda matrices.
X = np.random.rand(N, d)  # Features
C, L = initialize_model(X, K)
  
  
  def gaussian(x, c, l):
      '''
      G function
      '''
      p1 = np.power(np.linalg.det(l), 0.5) / np.power((2 * np.pi), d/2)
      p2 = np.exp(np.transpose(x - c).dot(1/2).dot(l).dot(x - c))
      return (p1 * p2).reshape(1)
  
  
  def dist(a, b, l):
      '''
      Distance euclidienne
      '''
      a = np.reshape(a, (-1, 1))
      b = np.reshape(b, (-1, 1))
      result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
      return result
  
  
  def plot_iteration(iteration, points, clusters, centers):
      fig = plt.figure()
      ax = fig.add_subplot(111)
      scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
      for i, j in centers:
          ax.scatter(i, j, s=50, c='red', marker='+')
      ax.set_xlabel('x')
      ax.set_ylabel('y')
      plt.colorbar(scatter)
      plt.ylim(0, 1)
      plt.xlim(0, 1)
      plt.savefig("test_" + str(iteration) + ".pdf")
  
  
  end_algo = False
  i = 0
  while not end_algo:
      if i == 10:
          exit(1)
      print("Iteration: ", i)
      # -- Calcul matrix distance
      distances = np.zeros((N, K))
  
      # -- Calcul closest cluster
      for n in range(N):
          for k in range(K):
              distances[n][k] = dist(X[n], C[k], L[k])
      closest_cluster = np.argmin(distances, axis=1)
  
      # -- Debug tool ----------------------
      if i % 1 == 0:
          # TSNE
          X_embedded = np.concatenate((X, C), axis=0)
          # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
          # Then plot
          plot_iteration(
              i,
              X_embedded[:X.shape[0]],
              closest_cluster,
              X_embedded[X.shape[0]:]
              )
      # ------------------------------------
  
      end_algo = True
      for k in range(K):
          # Find subset of X with values closed to the centroid c_k.
          X_sub = np.where(closest_cluster == k)
          X_sub = np.take(X, X_sub[0], axis=0)
          np.mean(X_sub, axis=0)
          C_new = np.mean(X_sub, axis=0)
  
          # -- COMPUTE NEW LAMBDA (here named K) --
          K_new = np.zeros((L.shape[1], L.shape[2]))
          for x in X_sub:
              x = np.reshape(x, (-1, 1))
              c_tmp = np.reshape(C_new, (-1, 1))
              K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
          K_new = K_new / X_sub.shape[0]
          K_new = K_new / np.power(np.linalg.det(K_new), 1/d)
          K_new = np.linalg.inv(K_new)
  
          if end_algo and (not (C[k] == C_new).all()):  # If the same stop
              end_algo = False
          C[k] = C_new
          L[k] = K_new
      i = i + 1
  
# Final plot with the last computed assignment (reuses X_embedded and
# closest_cluster left over from the last loop iteration).
plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)