Commit ed89325d5d02f6e7878e3fd52498c8ad1ca653be
1 parent: d4507c2683
Exists in: master
Now we can give more parameters to the k-means command. Mahalanobis was tested and seems to work well. Needs more tests.
Showing 3 changed files with 71 additions and 26 deletions
volia/clustering.py
... | ... | @@ -67,7 +67,17 @@ |
67 | 67 | print(json.dumps(eval)) |
68 | 68 | |
69 | 69 | |
70 | -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False): | |
70 | +def kmeans_run(features: str, | |
71 | + lst: str, | |
72 | + k: int, |
73 | + kmax: int, | |
74 | + klist, | |
75 | + maxiter: int, | |
76 | + ninit: int, | |
77 | + output: str, | |
78 | + tol: float, | |
79 | + debug: bool = False, | |
80 | + mahalanobis: bool = False): |
71 | 81 | """ |
72 | 82 | |
73 | 83 | @param features: output features |
74 | 84 | |
75 | 85 | |
... | ... | @@ -92,12 +102,12 @@ |
92 | 102 | |
93 | 103 | # Single-value case |
94 | 104 | if kmax is None and klist is None: |
95 | - print(f"Computing clustering with k={k}") | |
105 | + if debug: | |
106 | + print(f"Computing clustering with k={k}") | |
96 | 107 | model = CLUSTERING_METHODS["k-means"] |
97 | 108 | if mahalanobis: |
98 | - print("Computing with mahalanobis distance") | |
99 | 109 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
100 | - model.fit(X, k) | |
110 | + model.fit(X, k, tol, maxiter, debug) | |
101 | 111 | model.save(output) |
102 | 112 | |
103 | 113 | # Multi-value case with kmax |
... | ... | @@ -109,7 +119,7 @@ |
109 | 119 | model = CLUSTERING_METHODS["k-means"] |
110 | 120 | if mahalanobis: |
111 | 121 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
112 | - model.fit(X, i) | |
122 | + model.fit(X, i, tol, maxiter, debug) | |
113 | 123 | model.save(path.join(output, "clustering_" + str(i) + ".pkl")) |
114 | 124 | |
115 | 125 | # Second multi-value case, with klist |
116 | 126 | |
117 | 127 | |
118 | 128 | |
... | ... | @@ -120,12 +130,17 @@ |
120 | 130 | k = int(k) |
121 | 131 | model = CLUSTERING_METHODS["k-means"] |
122 | 132 | if mahalanobis: |
123 | - print("Computing with mahalanobis distance") | |
124 | 133 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
125 | - model.fit(X, k) | |
134 | + model.fit(X, k, tol, maxiter, debug) | |
126 | 135 | model.save(path.join(output, "clustering_" + str(k) + ".pkl")) |
127 | 136 | |
137 | + # TODO: Output JSON describing the final state (number of iterations, whether tol was reached and |
138 | + # stopped the process, which distance, which parameters, etc.). |
139 | + # TODO: Move example data into a directory. |
140 | + # TODO: Add example recipes. |
141 | + # TODO: n_init has to be taken into account in the Mahalanobis case of the k-means algorithm. |
128 | 142 | |
143 | + | |
129 | 144 | if __name__ == "__main__": |
130 | 145 | # Main parser |
131 | 146 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
... | ... | @@ -142,6 +157,19 @@ |
142 | 157 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="If specified, k is treated as kmin.") |
143 | 158 | parser_kmeans.add_argument("--klist", nargs="+", |
144 | 159 | help="List of k values to test. Like kmax, activates the multi-value mode.") |
160 | + parser_kmeans.add_argument("--maxiter", | |
161 | + type=int, | |
162 | + default=300, | |
163 | + help="Max number of iteration before stoping if not converging") | |
164 | + parser_kmeans.add_argument("--ninit", | |
165 | + type=int, | |
166 | + default=10, | |
167 | + help="Number of time the k-means algorithm will be run with different centroid seeds.") | |
168 | + parser_kmeans.add_argument("--tol", | |
169 | + type=float, | |
170 | + default=0.0001, | |
171 | + help="Tolerance to finish of distance between centroids and their updates.") | |
172 | + parser_kmeans.add_argument("--debug", action="store_true", help="Print debug information during clustering.") |
145 | 173 | parser_kmeans.add_argument("--output", |
146 | 174 | default=".kmeans", |
147 | 175 | help="output file if only k. Output directory if multiple kmax specified.") |
volia/clustering_modules/kmeans.py
... | ... | @@ -33,12 +33,12 @@ |
33 | 33 | with open(model_path, "wb") as f: |
34 | 34 | pickle.dump(self.kmeans_model, f) |
35 | 35 | |
36 | - def fit(self, features, k: int): | |
36 | + def fit(self, features, k: int, tol: float, maxiter: int = 300, debug: bool = False): |
37 | 37 | """ |
38 | 38 | |
39 | 39 | @param features: |
40 | 40 | @param k: |
41 | 41 | @return: |
42 | 42 | """ |
43 | - self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) | |
43 | + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features) |
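Note that n_init is still hard-coded to 10 in this call: the new --ninit argument is parsed but not yet forwarded to fit (see the TODO above); only max_iter and tol are. A minimal sketch of the equivalent direct scikit-learn call, with a hypothetical feature matrix:

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.random.rand(100, 20)  # hypothetical features
    model = KMeans(n_clusters=8, n_init=10, random_state=0,
                   max_iter=300, tol=0.0001).fit(X)
    print(model.n_iter_)  # number of iterations actually run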
volia/clustering_modules/kmeans_mahalanobis.py
... | ... | @@ -37,8 +37,8 @@ |
37 | 37 | @return: |
38 | 38 | """ |
39 | 39 | data = None |
40 | - with open(model_path): | |
41 | - data = pickle.load() | |
40 | + with open(model_path, "rb") as f: | |
41 | + data = pickle.load(f) | |
42 | 42 | if data is None: |
43 | 43 | raise Exception("The model could not be loaded") |
44 | 44 | else: |
... | ... | @@ -60,8 +60,8 @@ |
60 | 60 | with open(modelpath, "wb") as f: |
61 | 61 | pickle.dump(data, f) |
62 | 62 | |
63 | - def fit(self, features, K: int): | |
64 | - self._train(features, K) | |
63 | + def fit(self, features, k: int, tol: float = 0.0001, maxiter: int = 300, debug: bool = False): |
64 | + self._train(features, k, tol, maxiter, debug) | |
65 | 65 | |
66 | 66 | def _initialize_model(self, X, number_clusters): |
67 | 67 | d = X.shape[1] |
68 | 68 | |
... | ... | @@ -96,11 +96,12 @@ |
96 | 96 | #plt.xlim(0, 1) |
97 | 97 | plt.savefig("test_" + str(iteration) + ".pdf") |
98 | 98 | |
99 | - def _train(self, features, K: int): | |
99 | + def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False): | |
100 | 100 | X = features |
101 | 101 | N = X.shape[0] |
102 | 102 | d = X.shape[1] |
103 | 103 | |
104 | + X_embedded = None | |
104 | 105 | C, L = self._initialize_model(X, K) |
105 | 106 | self.C = C |
106 | 107 | self.L = L |
... | ... | @@ -109,9 +110,9 @@ |
109 | 110 | end_algo = False |
110 | 111 | i = 0 |
111 | 112 | while not end_algo: |
112 | - if i == 10: | |
113 | - exit(1) | |
114 | - print("Iteration: ", i) | |
113 | + if debug: | |
114 | + print("Iteration: ", i) | |
115 | + | |
115 | 116 | # Compute the distance matrix |
116 | 117 | distances = np.zeros((N, K)) |
117 | 118 | |
... | ... | @@ -119,11 +120,14 @@ |
119 | 120 | for k in range(self.K): |
120 | 121 | distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) |
121 | 122 | closest_cluster = np.argmin(distances, axis=1) |
122 | - if i % 1 == 0: | |
123 | - # -- Debug tool ---------------------- | |
124 | - # TSNE | |
125 | - #X_embedded = np.concatenate((X, self.C), axis=0) | |
126 | - X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
123 | + | |
124 | + # -- Debug tool ---------------------- | |
125 | + if debug and i % 10 == 0: | |
126 | + # TSNE if needed | |
127 | + X_embedded = np.concatenate((X, self.C), axis=0) | |
128 | + if d > 2: | |
129 | + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
130 | + | |
127 | 131 | # Then plot |
128 | 132 | self._plot_iteration( |
129 | 133 | i, |
130 | 134 | X_embedded[:X.shape[0]], |
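The body of _dist is not shown in this diff. Assuming it implements the standard Mahalanobis distance, with self.L[k] the precision matrix obtained below by pseudo-inverting the per-cluster covariance, a sketch would be:

    import numpy as np

    def mahalanobis_dist(x, c, L):
        # Distance between point x and centroid c under precision matrix L:
        # sqrt((x - c)^T L (x - c))
        delta = x - c
        return np.sqrt(delta @ L @ delta)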
... | ... | @@ -131,9 +135,9 @@ |
131 | 135 | closest_cluster, |
132 | 136 | X_embedded[X.shape[0]:] |
133 | 137 | ) |
134 | - # ------------------------------------ | |
138 | + # ------------------------------------ | |
135 | 139 | |
136 | - end_algo = True | |
140 | + old_c = self.C.copy() | |
137 | 141 | for k in range(K): |
138 | 142 | # Find the subset of X closest to the centroid c_k. |
139 | 143 | X_sub = np.where(closest_cluster == k) |
140 | 144 | |
141 | 145 | |
... | ... | @@ -152,9 +156,22 @@ |
152 | 156 | K_new = K_new / X_sub.shape[0] |
153 | 157 | K_new = np.linalg.pinv(K_new) |
154 | 158 | |
155 | - if end_algo and (not (self.C[k] == C_new).all()): # If the same stop | |
156 | - end_algo = False | |
159 | + #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop | |
160 | + # end_algo = False | |
157 | 161 | self.C[k] = C_new |
158 | 162 | self.L[k] = K_new |
163 | + | |
164 | + diff = np.sum(np.absolute((self.C - old_c) / old_c * 100)) | |
165 | + if diff > tol: |
166 | + if debug: |
167 | + print(f"diff: {diff}") |
168 | + else: |
169 | + end_algo = True |
170 | + if debug: |
171 | + print(f"Tolerance threshold {tol} reached with diff {diff}") |
159 | 172 | i = i + 1 |
173 | + if i >= maxiter: |
174 | + end_algo = True |
175 | + if debug: |
176 | + print(f"Maximum number of iterations ({maxiter}) reached")
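To summarize the stopping rule above: the loop ends when the summed relative centroid change (in percent) drops to tol or below, or when maxiter iterations have run. A standalone sketch of the criterion with hypothetical centroid values:

    import numpy as np

    old_c = np.array([[1.0, 2.0], [3.0, 4.0]])
    new_c = np.array([[1.001, 2.0], [3.0, 3.999]])
    tol = 0.0001

    diff = np.sum(np.absolute((new_c - old_c) / old_c * 100))  # summed % change
    print(diff, diff <= tol)  # ~0.125, False: keep iterating

Since diff is a percentage while the default tol is 0.0001, convergence by tolerance is strict in practice; the maxiter cap guarantees termination either way.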