Commit a9912f135f481a97c6113e5723b33d69de6a919d

Authored by quillotm
1 parent 05afc43e54
Exists in master

We can now specify the model type as a parameter of the kmeans learning command. This leaves more room for future evolution.

Showing 1 changed file with 14 additions and 10 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels, write_line 4 from core.data import read_features, read_lst, read_labels, write_line
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis 9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
10 from clustering_modules.kmeans_multidistance import kmeansMultidistance
10 11
11 from sklearn.preprocessing import LabelEncoder 12 from sklearn.preprocessing import LabelEncoder
12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score 13 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
13 14
14 import core.measures 15 import core.measures
15 import json 16 import json
16 17
17 18
18 CLUSTERING_METHODS = { 19 CLUSTERING_METHODS = {
19 "k-means": kmeans(), 20 "k-means": kmeans(),
20 "k-means-mahalanobis": kmeansMahalanobis(), 21 "k-means-mahalanobis": kmeansMahalanobis(),
21 "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True) 22 "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
23 "k-means-basic-mahalanobis": kmeansMultidistance(distance="mahalanobis"),
24 "k-means-basic-cosine": kmeansMultidistance(distance="cosine")
22 } 25 }
23 26
27 KMEANS_METHODS = [key for key in CLUSTERING_METHODS if key.startswith("k-means")]
28
24 EVALUATION_METHODS = { 29 EVALUATION_METHODS = {
25 "entropy": core.measures.entropy_score, 30 "entropy": core.measures.entropy_score,
26 "purity": core.measures.purity_score, 31 "purity": core.measures.purity_score,
27 "v-measure": v_measure_score, 32 "v-measure": v_measure_score,
28 "homogeneity": homogeneity_score, 33 "homogeneity": homogeneity_score,
29 "completeness": completeness_score, 34 "completeness": completeness_score,
30 } 35 }
31 36
32 37
33 def disequilibrium_run(): 38 def disequilibrium_run():
34 pass 39 pass
35 40
36 41
37 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): 42 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
38 """ 43 """
39 44
40 @param measure: 45 @param measure:
41 @param features: 46 @param features:
42 @param lst: 47 @param lst:
43 @param truelabels: 48 @param truelabels:
44 @param model: 49 @param model:
45 @param modeltype: 50 @param modeltype:
46 @return: 51 @return:
47 """ 52 """
48 module = CLUSTERING_METHODS[modeltype] 53 module = CLUSTERING_METHODS[modeltype]
49 module.load(model) 54 module.load(model)
50 55
51 eval = {} 56 eval = {}
52 for ms in measure: 57 for ms in measure:
53 evaluation = EVALUATION_METHODS[ms] 58 evaluation = EVALUATION_METHODS[ms]
54 feats_dict = read_features(features) 59 feats_dict = read_features(features)
55 labels_dict = read_labels(truelabels) 60 labels_dict = read_labels(truelabels)
56 lst_dict = read_lst(lst) 61 lst_dict = read_lst(lst)
57 lst_keys = [key for key in lst_dict] 62 lst_keys = [key for key in lst_dict]
58 feats = np.asarray([feats_dict[key] for key in lst_keys]) 63 feats = np.asarray([feats_dict[key] for key in lst_keys])
59 Y_pred = module.predict(feats) 64 Y_pred = module.predict(feats)
60 Y_truth = [labels_dict[key][0] for key in lst_keys] 65 Y_truth = [labels_dict[key][0] for key in lst_keys]
61 66
62 le = LabelEncoder() 67 le = LabelEncoder()
63 le.fit(Y_truth) 68 le.fit(Y_truth)
64 Y_truth = le.transform(Y_truth) 69 Y_truth = le.transform(Y_truth)
65 70
66 eval[ms] = evaluation(Y_truth, Y_pred) 71 eval[ms] = evaluation(Y_truth, Y_pred)
67 72
68 print(json.dumps(eval)) 73 print(json.dumps(eval))
69 74
70 75
71 def kmeans_run(features: str, 76 def kmeans_run(features: str,
72 lst: str, 77 lst: str,
73 k:int, 78 k:int,
74 kmax: int, 79 kmax: int,
75 klist, 80 klist,
76 maxiter: int, 81 maxiter: int,
77 ninit: int, 82 ninit: int,
78 output: str, 83 output: str,
79 tol: float, 84 tol: float,
80 debug: bool = False, 85 modeltype: str,
81 mahalanobis: str = False): 86 debug: bool = False):
82 """ 87 """
83 88
84 @param features: output features 89 @param features: output features
85 @param lst: list file 90 @param lst: list file
86 @param k: k (kmin if kmax specified) 91 @param k: k (kmin if kmax specified)
87 @param kmax: maximum k to compute 92 @param kmax: maximum k to compute
88 @param klist: list of k values to compute, ignore k value 93 @param klist: list of k values to compute, ignore k value
89 @param output: output file if kmax not specified, else, output directory 94 @param output: output file if kmax not specified, else, output directory
90 @param mahalanobis: distance option of k-means. 95 @param mahalanobis: distance option of k-means.
91 """ 96 """
92 json_content = locals().copy() 97 json_content = locals().copy()
93 98
94 def fit_model(k: int, output_file): 99 def fit_model(k: int, output_file):
95 if debug: 100 if debug:
96 print(f"Computing clustering with k={k}") 101 print(f"Computing clustering with k={k}")
97 model = CLUSTERING_METHODS["k-means"] 102 model = CLUSTERING_METHODS[modeltype]
98 if mahalanobis:
99 if debug:
100 print("Mahalanobis activated")
101 model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
102 model.fit(X, k, tol, ninit, maxiter, debug) 103 model.fit(X, k, tol, ninit, maxiter, debug)
103 model.save(output_file) 104 model.save(output_file)
104 json_content["models"].append({ 105 json_content["models"].append({
105 "model_file": output_file, 106 "model_file": output_file,
106 "k": k, 107 "k": k,
107 }) 108 })
108 109
109 json_content["models"] = [] 110 json_content["models"] = []
110 111
111 # -- READ FILES -- 112 # -- READ FILES --
112 features_dict = read_features(features) 113 features_dict = read_features(features)
113 lst_dict = read_lst(lst) 114 lst_dict = read_lst(lst)
114 X = np.asarray([features_dict[x] for x in lst_dict]) 115 X = np.asarray([features_dict[x] for x in lst_dict])
115 116
116 # Exception cases 117 # Exception cases
117 if kmax is None and klist is None and path.isdir(output): 118 if kmax is None and klist is None and path.isdir(output):
118 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") 119 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
119 120
120 if (kmax is not None or klist is not None) and path.isfile(output): 121 if (kmax is not None or klist is not None) and path.isfile(output):
121 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") 122 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
122 123
123 # Mono value case 124 # Mono value case
124 if kmax is None and klist is None: 125 if kmax is None and klist is None:
125 fit_model(k, output) 126 fit_model(k, output)
126 127
127 # Multi values case with kmax 128 # Multi values case with kmax
128 if kmax is not None: 129 if kmax is not None:
129 if not path.isdir(output): 130 if not path.isdir(output):
130 mkdir(output) 131 mkdir(output)
131 Ks = range(k, kmax + 1) 132 Ks = range(k, kmax + 1)
132 for i in Ks: 133 for i in Ks:
133 fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl")) 134 fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))
134 135
135 # Second multi values case with klist 136 # Second multi values case with klist
136 if klist is not None: 137 if klist is not None:
137 if not path.isdir(output): 138 if not path.isdir(output):
138 mkdir(output) 139 mkdir(output)
139 for k in klist: 140 for k in klist:
140 k = int(k) 141 k = int(k)
141 fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl")) 142 fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl"))
142 143
143 print(json.dumps(json_content)) 144 print(json.dumps(json_content))
144 145
145 146
146 def extract_run(features, lst, model, modeltype, outfile): 147 def extract_run(features, lst, model, modeltype, outfile):
147 feats_dict = read_features(features) 148 feats_dict = read_features(features)
148 lst_dict = read_lst(lst) 149 lst_dict = read_lst(lst)
149 lst_keys = [key for key in lst_dict] 150 lst_keys = [key for key in lst_dict]
150 feats = np.asarray([feats_dict[key] for key in lst_keys]) 151 feats = np.asarray([feats_dict[key] for key in lst_keys])
151 152
152 module = CLUSTERING_METHODS[modeltype] 153 module = CLUSTERING_METHODS[modeltype]
153 module.load(model) 154 module.load(model)
154 Y_pred = module.predict(feats) 155 Y_pred = module.predict(feats)
155 with open(outfile, "w") as f: 156 with open(outfile, "w") as f:
156 for i, key in enumerate(lst_keys): 157 for i, key in enumerate(lst_keys):
157 write_line(key, Y_pred[i], f) 158 write_line(key, Y_pred[i], f)
158 json_output = { 159 json_output = {
159 "outfile": outfile 160 "outfile": outfile
160 } 161 }
161 print(json.dumps(json_output)) 162 print(json.dumps(json_output))
162 163
163 164
164 if __name__ == "__main__": 165 if __name__ == "__main__":
165 # Main parser 166 # Main parser
166 parser = argparse.ArgumentParser(description="Clustering methods to apply") 167 parser = argparse.ArgumentParser(description="Clustering methods to apply")
167 subparsers = parser.add_subparsers(title="action") 168 subparsers = parser.add_subparsers(title="action")
168 169
169 # kmeans 170 # kmeans
170 parser_kmeans = subparsers.add_parser( 171 parser_kmeans = subparsers.add_parser(
171 "kmeans", help="Compute clustering using k-means algorithm") 172 "kmeans", help="Compute clustering using k-means algorithm")
172 173
173 parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") 174 parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
174 parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") 175 parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
175 parser_kmeans.add_argument("-k", default=2, type=int, 176 parser_kmeans.add_argument("-k", default=2, type=int,
176 help="number of clusters to compute. It is kmin if kmax is specified.") 177 help="number of clusters to compute. It is kmin if kmax is specified.")
177 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") 178 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
178 parser_kmeans.add_argument("--klist", nargs="+", 179 parser_kmeans.add_argument("--klist", nargs="+",
179 help="List of k values to test. As kmax, activate the multi values mod.") 180 help="List of k values to test. As kmax, activate the multi values mod.")
180 parser_kmeans.add_argument("--maxiter", 181 parser_kmeans.add_argument("--maxiter",
181 type=int, 182 type=int,
182 default=300, 183 default=300,
183 help="Max number of iteration before stoping if not converging") 184 help="Max number of iteration before stoping if not converging")
184 parser_kmeans.add_argument("--ninit", 185 parser_kmeans.add_argument("--ninit",
185 type=int, 186 type=int,
186 default=10, 187 default=10,
187 help="Number of time the k-means algorithm will be run with different centroid seeds.") 188 help="Number of time the k-means algorithm will be run with different centroid seeds.")
188 parser_kmeans.add_argument("--tol", 189 parser_kmeans.add_argument("--tol",
189 type=float, 190 type=float,
190 default=0.0001, 191 default=0.0001,
191 help="Tolerance to finish of distance between centroids and their updates.") 192 help="Tolerance to finish of distance between centroids and their updates.")
192 parser_kmeans.add_argument("--debug", action="store_true") 193 parser_kmeans.add_argument("--debug", action="store_true")
193 parser_kmeans.add_argument("--output", 194 parser_kmeans.add_argument("--output",
194 default=".kmeans", 195 default=".kmeans",
195 help="output file if only k. Output directory if multiple kmax specified.") 196 help="output file if only k. Output directory if multiple kmax specified.")
196 parser_kmeans.add_argument("--mahalanobis", action="store_true") 197 parser_kmeans.add_argument("--modeltype",
198 required=True,
199 choices=KMEANS_METHODS,
200 help="type of model for learning")
197 parser_kmeans.set_defaults(which="kmeans") 201 parser_kmeans.set_defaults(which="kmeans")
198 202
199 # measure 203 # measure
200 parser_measure = subparsers.add_parser( 204 parser_measure = subparsers.add_parser(
201 "measure", help="compute the entropy") 205 "measure", help="compute the entropy")
202 206
203 parser_measure.add_argument("--measure", 207 parser_measure.add_argument("--measure",
204 required=True, 208 required=True,
205 nargs="+", 209 nargs="+",
206 choices=[key for key in EVALUATION_METHODS], 210 choices=[key for key in EVALUATION_METHODS],
207 help="...") 211 help="...")
208 parser_measure.add_argument("--features", required=True, type=str, help="...") 212 parser_measure.add_argument("--features", required=True, type=str, help="...")
209 parser_measure.add_argument("--lst", required=True, type=str, help="...") 213 parser_measure.add_argument("--lst", required=True, type=str, help="...")
210 parser_measure.add_argument("--truelabels", required=True, type=str, help="...") 214 parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
211 parser_measure.add_argument("--model", required=True, type=str, help="...") 215 parser_measure.add_argument("--model", required=True, type=str, help="...")
212 parser_measure.add_argument("--modeltype", 216 parser_measure.add_argument("--modeltype",
213 required=True, 217 required=True,
214 choices=[key for key in CLUSTERING_METHODS], 218 choices=[key for key in CLUSTERING_METHODS],
215 help="type of model for learning") 219 help="type of model for learning")
216 parser_measure.set_defaults(which="measure") 220 parser_measure.set_defaults(which="measure")
217 221
218 # disequilibrium 222 # disequilibrium
219 parser_disequilibrium = subparsers.add_parser( 223 parser_disequilibrium = subparsers.add_parser(
220 "disequilibrium", help="...") 224 "disequilibrium", help="...")
221 225
222 parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") 226 parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
223 parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") 227 parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
224 parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") 228 parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
225 parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") 229 parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
226 parser_disequilibrium.add_argument("--model-type", 230 parser_disequilibrium.add_argument("--modeltype",
227 required=True, 231 required=True,
228 choices=["kmeans", "2", "3"], 232 choices=["kmeans", "2", "3"],
229 help="...") 233 help="...")
230 parser_disequilibrium.set_defaults(which="disequilibrium") 234 parser_disequilibrium.set_defaults(which="disequilibrium")
231 235
232 # Extract 236 # Extract
233 parser_extract = subparsers.add_parser( 237 parser_extract = subparsers.add_parser(
234 "extract", help="extract cluster labels") 238 "extract", help="extract cluster labels")
235 239
236 parser_extract.add_argument("--features", required=True, type=str, help="...") 240 parser_extract.add_argument("--features", required=True, type=str, help="...")
237 parser_extract.add_argument("--lst", required=True, type=str, help="...") 241 parser_extract.add_argument("--lst", required=True, type=str, help="...")
238 parser_extract.add_argument("--model", required=True, type=str, help="...") 242 parser_extract.add_argument("--model", required=True, type=str, help="...")
239 parser_extract.add_argument("--modeltype", 243 parser_extract.add_argument("--modeltype",
240 required=True, 244 required=True,
241 choices=[key for key in CLUSTERING_METHODS], 245 choices=[key for key in CLUSTERING_METHODS],
242 help="type of model for learning") 246 help="type of model for learning")
243 parser_extract.add_argument("--outfile", required=True, type=str, help="...") 247 parser_extract.add_argument("--outfile", required=True, type=str, help="...")
244 parser_extract.set_defaults(which="extract") 248 parser_extract.set_defaults(which="extract")
245 249
246 # Parse 250 # Parse
247 args = parser.parse_args() 251 args = parser.parse_args()
248 252
249 # Run commands 253 # Run commands
250 runner = SubCommandRunner({ 254 runner = SubCommandRunner({
251 "kmeans": kmeans_run, 255 "kmeans": kmeans_run,
252 "measure": measure_run, 256 "measure": measure_run,
253 "disequilibrium": disequilibrium_run, 257 "disequilibrium": disequilibrium_run,
254 "extract": extract_run 258 "extract": extract_run