Commit 4152e83df25ef19c8b048592e9629911bcf77e1a

Authored by quillotm
1 parent 3c07f672ad
Exists in master

Adding k-means Mahalanobis. The algorithm is now fully functional. Need to solve …

…some problems with identity matrix usage and infinite or NaN values.

Showing 3 changed files with 216 additions and 17 deletions Side-by-side Diff

... ... @@ -6,6 +6,7 @@
6 6 from sklearn.cluster import KMeans
7 7 import pickle
8 8 from clustering_modules.kmeans import kmeans
  9 +from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
9 10  
10 11 from sklearn.preprocessing import LabelEncoder
11 12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
... ... @@ -15,7 +16,8 @@
15 16  
16 17  
17 18 CLUSTERING_METHODS = {
18   - "k-means": kmeans()
  19 + "k-means": kmeans(),
  20 + "k-means-mahalanobis": kmeansMahalanobis()
19 21 }
20 22  
21 23 EVALUATION_METHODS = {
... ... @@ -65,8 +67,7 @@
65 67 print(json.dumps(eval))
66 68  
67 69  
68   -
69   -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
  70 +def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False):
70 71 """
71 72  
72 73 @param features: output features
... ... @@ -75,6 +76,7 @@
75 76 @param kmax: maximum k to compute
76 77 @param klist: list of k values to compute, ignore k value
77 78 @param output: output file if kmax not specified, else, output directory
  79 + @param mahalanobis: distance option of k-means.
78 80 """
79 81 # -- READ FILES --
80 82 features_dict = read_features(features)
... ... @@ -91,9 +93,12 @@
91 93 # Mono value case
92 94 if kmax is None and klist is None:
93 95 print(f"Computing clustering with k={k}")
94   - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
95   - preds = kmeans.predict(X)
96   - pickle.dump(kmeans, open(output, "wb"))
  96 + model = CLUSTERING_METHODS["k-means"]
  97 + if mahalanobis:
  98 + print("Computing with mahalanobis distance")
  99 + model = CLUSTERING_METHODS["k-means-mahalanobis"]
  100 + model.fit(X, k)
  101 + model.save(output)
97 102  
98 103 # Multi values case with kmax
99 104 if kmax is not None:
... ... @@ -101,10 +106,11 @@
101 106 mkdir(output)
102 107 Ks = range(k, kmax + 1)
103 108 for i in Ks:
104   - print(f"Computing clustering with k={i}")
105   - kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X)
106   - preds = kmeans.predict(X)
107   - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb"))
  109 + model = CLUSTERING_METHODS["k-means"]
  110 + if mahalanobis:
  111 + model = CLUSTERING_METHODS["k-means-mahalanobis"]
  112 + model.fit(X, i)
  113 + model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
108 114  
109 115 # Second multi values case with klist
110 116 if klist is not None:
... ... @@ -112,10 +118,12 @@
112 118 mkdir(output)
113 119 for k in klist:
114 120 k = int(k)
115   - print(f"Computing clustering with k={k}")
116   - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
117   - preds = kmeans.predict(X)
118   - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb"))
  121 + model = CLUSTERING_METHODS["k-means"]
  122 + if mahalanobis:
  123 + print("Computing with mahalanobis distance")
  124 + model = CLUSTERING_METHODS["k-means-mahalanobis"]
  125 + model.fit(X, k)
  126 + model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
119 127  
120 128  
121 129 if __name__ == "__main__":
... ... @@ -134,7 +142,10 @@
134 142 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
135 143 parser_kmeans.add_argument("--klist", nargs="+",
136 144 help="List of k values to test. As kmax, activate the multi values mod.")
137   - parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
  145 + parser_kmeans.add_argument("--output",
  146 + default=".kmeans",
  147 + help="output file if only k. Output directory if multiple kmax specified.")
  148 + parser_kmeans.add_argument("--mahalanobis", action="store_true")
138 149 parser_kmeans.set_defaults(which="kmeans")
139 150  
140 151 # measure
volia/clustering_modules/kmeans.py
... ... @@ -8,8 +8,37 @@
8 8 self.kmeans_model = None
9 9  
10 10 def predict(self, features):
  11 + """
  12 +
  13 + @param features:
  14 + @return:
  15 + """
11 16 return self.kmeans_model.predict(features)
12 17  
13   - def load(self, model_path):
14   - self.kmeans_model = pickle.load(open(model_path, "rb"))
  18 + def load(self, model_path: str):
  19 + """
  20 +
  21 + @param model_path:
  22 + @return:
  23 + """
  24 + with open(model_path, "rb") as f:
  25 + self.kmeans_model = pickle.load(f)
  26 +
  27 + def save(self, model_path: str):
  28 + """
  29 +
  30 + @param model_path:
  31 + @return:
  32 + """
  33 + with open(model_path, "wb") as f:
  34 + pickle.dump(self.kmeans_model, f)
  35 +
  36 + def fit(self, features, k: int):
  37 + """
  38 +
  39 + @param features:
  40 + @param k:
  41 + @return:
  42 + """
  43 + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
volia/clustering_modules/kmeans_mahalanobis.py
  1 +
  2 +
  3 +from sklearn.cluster import KMeans
  4 +import pickle
  5 +import numpy as np
  6 +import matplotlib.pyplot as plt
  7 +from sklearn.manifold import TSNE
  8 +from abstract_clustering import AbstractClustering
  9 +
class kmeansMahalanobis():
    """
    K-means clustering where every cluster owns its own quadratic metric.

    Cluster ``k`` is described by a centroid ``C[k]`` and a matrix ``L[k]``;
    the dissimilarity between a point ``x`` and cluster ``k`` is
    ``(x - C[k])^T . L[k] . (x - C[k])``. ``L[k]`` starts as the identity
    (squared Euclidean distance) and is re-estimated at every iteration as
    the inverse of the (regularized) covariance of the points assigned to
    the cluster, i.e. a squared Mahalanobis distance.
    """

    def __init__(self):
        """
        Create an empty (unfitted) model.
        """
        self.C = None  # centroids, array of shape (K, d) once fitted/loaded
        self.L = None  # per-cluster metric matrices, shape (K, d, d)
        self.K = None  # number of clusters

    def _pairwise_distances(self, X):
        """
        Compute the dissimilarity of every row of X to every cluster.

        @param X: 2-D float array, one sample per row.
        @return: array of shape (number of rows of X, K).
        """
        # diff[n, k, :] = X[n] - C[k]
        diff = X[:, np.newaxis, :] - self.C[np.newaxis, :, :]
        # Quadratic form diff^T . L[k] . diff for every (n, k) pair at once.
        return np.einsum("nki,kij,nkj->nk", diff, self.L, diff)

    def predict(self, features):
        """
        Assign each sample to its closest cluster.

        @param features: 2-D array, one sample per row.
        @return: 1-D integer array holding the cluster index of each sample.
        @raise RuntimeError: if the model has been neither fitted nor loaded.
        """
        if self.C is None or self.L is None or self.K is None:
            raise RuntimeError("kmeansMahalanobis model is not fitted or loaded")
        X = np.asarray(features, dtype=float)
        return np.argmin(self._pairwise_distances(X), axis=1)

    def load(self, model_path):
        """
        Restore the model parameters written by ``save``.

        @param model_path: path of the pickle file to read.
        @raise Exception: if the file unpickles to ``None``.
        """
        # NOTE: pickle can execute arbitrary code while loading; only use
        # this on model files coming from a trusted source.
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        if data is None:
            raise Exception("Le modèle n'a pas pu être chargé")
        self.C = data["C"]
        self.L = data["L"]
        self.K = data["K"]

    def save(self, modelpath: str):
        """
        Pickle the model parameters (C, L and K) to a file.

        @param modelpath: path of the file to write.
        """
        data = {
            "C": self.C,
            "L": self.L,
            "K": self.K
        }
        with open(modelpath, "wb") as f:
            pickle.dump(data, f)

    def fit(self, features, K: int, max_iter: int = 100,
            reg_covar: float = 1e-6, debug: bool = False):
        """
        Train the model on the given samples.

        @param features: 2-D array, one sample per row.
        @param K: number of clusters.
        @param max_iter: maximum number of assignment/update iterations.
        @param reg_covar: value added to the diagonal of each cluster
            covariance before inversion, so it is always invertible.
        @param debug: if True, print per-iteration information and write a
            t-SNE plot of every iteration to ``test_<iteration>.pdf``.
        """
        self._train(features, K, max_iter=max_iter, reg_covar=reg_covar, debug=debug)

    def _initialize_model(self, X, number_clusters):
        """
        Pick the initial centroids among the samples and identity metrics.

        @param X: 2-D array, one sample per row.
        @param number_clusters: number of clusters to initialize.
        @return: tuple (C, L) of shapes (number_clusters, d) and
            (number_clusters, d, d).
        """
        n_samples, d = X.shape
        # Draw distinct samples whenever possible: duplicated seeds would
        # leave one of the clusters empty from the very first iteration.
        indices = np.random.choice(
            n_samples, number_clusters, replace=number_clusters > n_samples
        )
        # Float copy so that later in-place centroid updates are not truncated.
        C = np.array(X[indices], dtype=float)
        L = np.tile(np.identity(d), (number_clusters, 1, 1))
        return C, L

    def _dist(self, a, b, l):
        """
        Quadratic-form dissimilarity (a - b)^T . l . (a - b).

        This is the squared Euclidean distance when ``l`` is the identity and
        the squared Mahalanobis distance when ``l`` is an inverse covariance.

        @param a: first vector.
        @param b: second vector.
        @param l: square metric matrix.
        @return: scalar dissimilarity.
        """
        diff = np.reshape(a, (-1, 1)) - np.reshape(b, (-1, 1))
        return np.transpose(diff).dot(l).dot(diff)[0][0]

    def _plot_iteration(self, iteration, points, clusters, centers):
        """
        Debug helper: scatter-plot 2-D points colored by cluster, with the
        centers drawn as red crosses, into ``test_<iteration>.pdf``.

        @param iteration: iteration number, used in the output file name.
        @param points: 2-D coordinates of the samples.
        @param clusters: cluster index of each sample (used as color).
        @param centers: 2-D coordinates of the centroids.
        """
        fig = plt.figure()
        ax = fig.add_subplot(111)
        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
        ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        plt.colorbar(scatter)
        plt.savefig("test_" + str(iteration) + ".pdf")
        # Release the figure; otherwise one figure per iteration stays alive.
        plt.close(fig)

    def _train(self, features, K: int, max_iter: int = 100,
               reg_covar: float = 1e-6, debug: bool = False):
        """
        Run the iterative assignment/update loop until the centroids stop
        moving or ``max_iter`` iterations have been performed.

        @param features: 2-D array, one sample per row.
        @param K: number of clusters.
        @param max_iter: maximum number of iterations.
        @param reg_covar: ridge added to each covariance diagonal.
        @param debug: enable per-iteration prints and t-SNE plots.
        @raise ValueError: if K is lower than 1.
        """
        if K < 1:
            raise ValueError("K must be a positive integer")

        X = np.asarray(features, dtype=float)
        N, d = X.shape

        self.C, self.L = self._initialize_model(X, K)
        self.K = K
        ridge = reg_covar * np.identity(d)

        for i in range(max_iter):
            if debug:
                print("Iteration: ", i)

            # Assignment step: closest cluster under the current metrics.
            distances = self._pairwise_distances(X)
            if debug:
                print(distances)
            closest_cluster = np.argmin(distances, axis=1)

            if debug:
                # Project samples and centroids together so they share the
                # same 2-D embedding, then plot the current assignment.
                X_embedded = TSNE(n_components=2).fit_transform(
                    np.concatenate((X, self.C), axis=0)
                )
                self._plot_iteration(i, X_embedded[:N], closest_cluster, X_embedded[N:])

            # Update step.
            converged = True
            for k in range(K):
                X_sub = X[closest_cluster == k]
                if X_sub.shape[0] == 0:
                    # Empty cluster: keep its previous centroid and metric
                    # instead of producing NaN from a mean over no samples.
                    continue

                C_new = X_sub.mean(axis=0)
                centered = X_sub - C_new
                # Biased covariance of the cluster; the ridge keeps it
                # invertible for tiny or degenerate clusters.
                covariance = centered.T.dot(centered) / X_sub.shape[0] + ridge

                if not np.array_equal(self.C[k], C_new):
                    converged = False
                self.C[k] = C_new
                self.L[k] = np.linalg.inv(covariance)

            if converged:
                break