Commit a9912f135f481a97c6113e5723b33d69de6a919d
1 parent 05afc43e54
Exists in master
We can now specify the modeltype as a parameter of the kmeans learning command. This makes the command more flexible for future evolution.
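For illustration (not part of the commit), the kmeans subcommand would now name the clustering backend explicitly instead of toggling the removed --mahalanobis flag. The feature, list and output paths below are placeholders, and the script is assumed to be run from the volia directory so its local imports resolve:

    python clustering.py kmeans --features feats.txt --lst train.lst -k 8 \
        --modeltype k-means-mahalanobis --output model_k8.pkl

Any CLUSTERING_METHODS key starting with "k-means" (collected in KMEANS_METHODS) is accepted for --modeltype, so supporting a new distance only requires registering one more dictionary entry rather than adding another boolean flag.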
Showing 1 changed file with 14 additions and 10 deletions
volia/clustering.py
 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels, write_line
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
+from clustering_modules.kmeans_multidistance import kmeansMultidistance
 
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
 
 import core.measures
 import json
 
 
 CLUSTERING_METHODS = {
     "k-means": kmeans(),
     "k-means-mahalanobis": kmeansMahalanobis(),
-    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True)
+    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
+    "k-means-basic-mahalanobis": kmeansMultidistance(distance="mahalanobis"),
+    "k-means-basic-cosine": kmeansMultidistance(distance="cosine")
 }
 
+KMEANS_METHODS = [key for key in CLUSTERING_METHODS if key.startswith("k-means")]
+
 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
 
 
 def disequilibrium_run():
     pass
 
 
 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
 
     @param measure:
     @param features:
     @param lst:
     @param truelabels:
     @param model:
     @param modeltype:
     @return:
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)
 
     eval = {}
     for ms in measure:
         evaluation = EVALUATION_METHODS[ms]
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_dict = read_lst(lst)
         lst_keys = [key for key in lst_dict]
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         Y_truth = [labels_dict[key][0] for key in lst_keys]
 
         le = LabelEncoder()
         le.fit(Y_truth)
         Y_truth = le.transform(Y_truth)
 
         eval[ms] = evaluation(Y_truth, Y_pred)
 
     print(json.dumps(eval))
 
 
 def kmeans_run(features: str,
                lst: str,
                k:int,
                kmax: int,
                klist,
                maxiter: int,
                ninit: int,
                output: str,
                tol: float,
-               debug: bool = False,
-               mahalanobis: str = False):
+               modeltype: str,
+               debug: bool = False):
     """
 
     @param features: output features
     @param lst: list file
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute
     @param klist: list of k values to compute, ignore k value
     @param output: output file if kmax not specified, else, output directory
     @param mahalanobis: distance option of k-means.
     """
     json_content = locals().copy()
 
     def fit_model(k: int, output_file):
         if debug:
             print(f"Computing clustering with k={k}")
-        model = CLUSTERING_METHODS["k-means"]
-        if mahalanobis:
-            if debug:
-                print("Mahalanobis activated")
-            model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
+        model = CLUSTERING_METHODS[modeltype]
         model.fit(X, k, tol, ninit, maxiter, debug)
         model.save(output_file)
         json_content["models"].append({
             "model_file": output_file,
             "k": k,
         })
 
     json_content["models"] = []
 
     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])
 
     # Exception cases
     if kmax is None and klist is None and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
 
     if (kmax is not None or klist is not None) and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
 
     # Mono value case
     if kmax is None and klist is None:
         fit_model(k, output)
 
     # Multi values case with kmax
     if kmax is not None:
         if not path.isdir(output):
             mkdir(output)
         Ks = range(k, kmax + 1)
         for i in Ks:
             fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))
 
     # Second multi values case with klist
     if klist is not None:
         if not path.isdir(output):
             mkdir(output)
         for k in klist:
             k = int(k)
             fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl"))
 
     print(json.dumps(json_content))
 
 
 def extract_run(features, lst, model, modeltype, outfile):
     feats_dict = read_features(features)
     lst_dict = read_lst(lst)
     lst_keys = [key for key in lst_dict]
     feats = np.asarray([feats_dict[key] for key in lst_keys])
 
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)
     Y_pred = module.predict(feats)
     with open(outfile, "w") as f:
         for i, key in enumerate(lst_keys):
             write_line(key, Y_pred[i], f)
     json_output = {
         "outfile": outfile
     }
     print(json.dumps(json_output))
 
 
 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")
 
     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")
 
     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Max number of iteration before stoping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of time the k-means algorithm will be run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Tolerance to finish of distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="output file if only k. Output directory if multiple kmax specified.")
-    parser_kmeans.add_argument("--mahalanobis", action="store_true")
+    parser_kmeans.add_argument("--modeltype",
+                               required=True,
+                               choices=KMEANS_METHODS,
+                               help="type of model for learning")
     parser_kmeans.set_defaults(which="kmeans")
 
     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute the entropy")
 
     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")
 
     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")
 
     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
-    parser_disequilibrium.add_argument("--model-type",
+    parser_disequilibrium.add_argument("--modeltype",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")
 
     # Extract
     parser_extract = subparsers.add_parser(
         "extract", help="extract cluster labels")
 
     parser_extract.add_argument("--features", required=True, type=str, help="...")
     parser_extract.add_argument("--lst", required=True, type=str, help="...")
     parser_extract.add_argument("--model", required=True, type=str, help="...")
     parser_extract.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_extract.add_argument("--outfile", required=True, type=str, help="...")
     parser_extract.set_defaults(which="extract")
 
     # Parse
     args = parser.parse_args()
 
     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run,
         "extract": extract_run