Commit ed89325d5d02f6e7878e3fd52498c8ad1ca653be

Authored by quillotm
1 parent d4507c2683
Exists in master

We can now pass more parameters to the k-means command. The Mahalanobis variant was tested and seems to work well. More tests are needed.

Showing 3 changed files with 71 additions and 26 deletions Side-by-side Diff

... ... @@ -67,7 +67,17 @@
67 67 print(json.dumps(eval))
68 68  
69 69  
70   -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False):
  70 +def kmeans_run(features: str,
  71 + lst: str,
  72 + k:int,
  73 + kmax: int,
  74 + klist,
  75 + maxiter: int,
  76 + ninit: int,
  77 + output: str,
  78 + tol: float,
  79 + debug: bool = False,
  80 + mahalanobis: str = False):
71 81 """
72 82  
73 83 @param features: output features
74 84  
75 85  
... ... @@ -92,12 +102,12 @@
92 102  
93 103 # Mono value case
94 104 if kmax is None and klist is None:
95   - print(f"Computing clustering with k={k}")
  105 + if debug:
  106 + print(f"Computing clustering with k={k}")
96 107 model = CLUSTERING_METHODS["k-means"]
97 108 if mahalanobis:
98   - print("Computing with mahalanobis distance")
99 109 model = CLUSTERING_METHODS["k-means-mahalanobis"]
100   - model.fit(X, k)
  110 + model.fit(X, k, tol, maxiter, debug)
101 111 model.save(output)
102 112  
103 113 # Multi values case with kmax
... ... @@ -109,7 +119,7 @@
109 119 model = CLUSTERING_METHODS["k-means"]
110 120 if mahalanobis:
111 121 model = CLUSTERING_METHODS["k-means-mahalanobis"]
112   - model.fit(X, i)
  122 + model.fit(X, i, tol, maxiter, debug)
113 123 model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
114 124  
115 125 # Second multi values case with klist
116 126  
117 127  
118 128  
... ... @@ -120,12 +130,17 @@
120 130 k = int(k)
121 131 model = CLUSTERING_METHODS["k-means"]
122 132 if mahalanobis:
123   - print("Computing with mahalanobis distance")
124 133 model = CLUSTERING_METHODS["k-means-mahalanobis"]
125   - model.fit(X, k)
  134 + model.fit(X, k, tol, maxiter, debug)
126 135 model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
127 136  
  137 + # TODO: Output a JSON file describing the final run parameters: number of iterations, whether the tolerance
  138 + # was reached and stopped the process, etc. (which distance, which parameters, etc.)
  139 + # TODO: Move example data into a directory.
  140 + # TODO: Add example recipes
  141 + # TODO: n_init has to be taken into account in the Mahalanobis case of the k-means algorithm.
128 142  
  143 +
129 144 if __name__ == "__main__":
130 145 # Main parser
131 146 parser = argparse.ArgumentParser(description="Clustering methods to apply")
... ... @@ -142,6 +157,19 @@
142 157 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
143 158 parser_kmeans.add_argument("--klist", nargs="+",
144 159 help="List of k values to test. As kmax, activate the multi values mod.")
  160 + parser_kmeans.add_argument("--maxiter",
  161 + type=int,
  162 + default=300,
  163 + help="Max number of iteration before stoping if not converging")
  164 + parser_kmeans.add_argument("--ninit",
  165 + type=int,
  166 + default=10,
  167 + help="Number of time the k-means algorithm will be run with different centroid seeds.")
  168 + parser_kmeans.add_argument("--tol",
  169 + type=float,
  170 + default=0.0001,
  171 + help="Tolerance to finish of distance between centroids and their updates.")
  172 + parser_kmeans.add_argument("--debug", action="store_true")
145 173 parser_kmeans.add_argument("--output",
146 174 default=".kmeans",
147 175 help="output file if only k. Output directory if multiple kmax specified.")
volia/clustering_modules/kmeans.py
... ... @@ -33,12 +33,12 @@
33 33 with open(model_path, "wb") as f:
34 34 pickle.dump(self.kmeans_model, f)
35 35  
36   - def fit(self, features, k: int):
  36 + def fit(self, features, k: int, tol: float, maxiter: int=300, debug: bool=False):
37 37 """
38 38  
39 39 @param features:
40 40 @param k:
41 41 @return:
42 42 """
43   - self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
  43 + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features)
volia/clustering_modules/kmeans_mahalanobis.py
... ... @@ -37,8 +37,8 @@
37 37 @return:
38 38 """
39 39 data = None
40   - with open(model_path):
41   - data = pickle.load()
  40 + with open(model_path, "rb") as f:
  41 + data = pickle.load(f)
42 42 if data is None:
43 43 raise Exception("Le modèle n'a pas pu être chargé")
44 44 else:
... ... @@ -60,8 +60,8 @@
60 60 with open(modelpath, "wb") as f:
61 61 pickle.dump(data, f)
62 62  
63   - def fit(self, features, K: int):
64   - self._train(features, K)
  63 + def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
  64 + self._train(features, k, tol, maxiter, debug)
65 65  
66 66 def _initialize_model(self, X, number_clusters):
67 67 d = X.shape[1]
68 68  
... ... @@ -96,11 +96,12 @@
96 96 #plt.xlim(0, 1)
97 97 plt.savefig("test_" + str(iteration) + ".pdf")
98 98  
99   - def _train(self, features, K: int):
  99 + def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False):
100 100 X = features
101 101 N = X.shape[0]
102 102 d = X.shape[1]
103 103  
  104 + X_embedded = None
104 105 C, L = self._initialize_model(X, K)
105 106 self.C = C
106 107 self.L = L
... ... @@ -109,9 +110,9 @@
109 110 end_algo = False
110 111 i = 0
111 112 while not end_algo:
112   - if i == 10:
113   - exit(1)
114   - print("Iteration: ", i)
  113 + if debug:
  114 + print("Iteration: ", i)
  115 +
115 116 # Calcul matrix distance
116 117 distances = np.zeros((N, K))
117 118  
... ... @@ -119,11 +120,14 @@
119 120 for k in range(self.K):
120 121 distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
121 122 closest_cluster = np.argmin(distances, axis=1)
122   - if i % 1 == 0:
123   - # -- Debug tool ----------------------
124   - # TSNE
125   - #X_embedded = np.concatenate((X, self.C), axis=0)
126   - X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
  123 +
  124 + # -- Debug tool ----------------------
  125 + if debug and i % 10 == 0:
  126 + # TSNE if needed
  127 + X_embedded = np.concatenate((X, self.C), axis=0)
  128 + if d > 2:
  129 + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
  130 +
127 131 # Then plot
128 132 self._plot_iteration(
129 133 i,
130 134  
... ... @@ -131,9 +135,9 @@
131 135 closest_cluster,
132 136 X_embedded[X.shape[0]:]
133 137 )
134   - # ------------------------------------
  138 + # ------------------------------------
135 139  
136   - end_algo = True
  140 + old_c = self.C.copy()
137 141 for k in range(K):
138 142 # Find the subset of X whose points are closest to the centroid c_k.
139 143 X_sub = np.where(closest_cluster == k)
140 144  
141 145  
... ... @@ -152,9 +156,22 @@
152 156 K_new = K_new / X_sub.shape[0]
153 157 K_new = np.linalg.pinv(K_new)
154 158  
155   - if end_algo and (not (self.C[k] == C_new).all()): # If the same stop
156   - end_algo = False
  159 + #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop
  160 + # end_algo = False
157 161 self.C[k] = C_new
158 162 self.L[k] = K_new
  163 +
  164 + diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
  165 + if diff > tol:
  166 + end_algo = False
  167 + if debug:
  168 + print(f"{diff}")
  169 + elif debug:
  170 + print(f"Tolerance threshold {tol} reached with diff {diff}")
  171 + end_algo = True
159 172 i = i + 1
  173 + if i > maxiter:
  174 + end_algo = True
  175 + if debug:
  176 + print(f"Iteration {maxiter} reached")