Commit ef499b777c4665a9384b4167457abc2a17baf833

Authored by quillotm
1 parent 1bcb37e33d
Exists in master

Now we can extract cluster labels and save them to a file. This is useful for training other systems based on these labels (or for creating plots).

Showing 1 changed file with 36 additions and 3 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels 4 from core.data import read_features, read_lst, read_labels, write_line
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis 9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
10 10
11 from sklearn.preprocessing import LabelEncoder 11 from sklearn.preprocessing import LabelEncoder
12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score 12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
13 13
14 import core.measures 14 import core.measures
15 import json 15 import json
16 16
17 17
# Registry of clustering back-ends, keyed by the name accepted by the
# "--modeltype" command-line option. The values are instances created once at
# import time, so every command that looks a method up shares the same object.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
}

# Registry of evaluation measures, keyed by the name accepted by the
# "--measure" command-line option. Each value is called as
# score(true_labels, predicted_labels).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}
31 31
32 32
def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder for the "disequilibrium" command: not implemented yet, does nothing.

    The parameters mirror the options declared by the "disequilibrium"
    sub-parser so that the handler can be called with the parsed arguments
    as keyword arguments (presumably how the command runner invokes it;
    confirm against SubCommandRunner). All of them are currently ignored.

    @param features: features file
    @param lstrain: list file of the training set
    @param lstest: list file of the test set
    @param model: model file
    @param model_type: type of the model
    @return: None
    """
    return None
35 35
36 36
def measure_run(measure, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a saved clustering model against reference labels and print the
    scores as a JSON object ({measure name: score}) on standard output.

    @param measure: names of the measures to compute (keys of EVALUATION_METHODS);
                    a single name given as a string is accepted as well
    @param features: features file
    @param lst: list file selecting (and ordering) the items to evaluate
    @param truelabels: reference labels file
    @param model: path of the saved clustering model
    @param modeltype: kind of model to load (key of CLUSTERING_METHODS)
    @return: None
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Everything below is independent of the measure being computed, so it is
    # done once instead of once per measure.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Only the first element of each label entry is used as the reference
    # class; it is encoded to integer ids before scoring.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    scores = {ms: EVALUATION_METHODS[ms](Y_truth, Y_pred) for ms in measure}
    print(json.dumps(scores))
69 69
70 70
def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Fit one or several k-means models and save them to disk.

    With neither kmax nor klist, a single model is fitted with k clusters and
    saved to the file "output". With kmax and/or klist, "output" is a
    directory (created if missing) receiving one "clustering_<k>.pkl" file per
    value of k. A JSON summary (the call parameters plus the list of saved
    models) is printed on standard output.

    @param features: features file
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param maxiter: maximum number of iterations, forwarded to the model's fit
    @param ninit: number of initialisations, forwarded to the model's fit
    @param output: output file if kmax not specified, else, output directory
    @param tol: tolerance, forwarded to the model's fit
    @param debug: print progress information
    @param mahalanobis: distance option of k-means.
    """
    # Must stay the first statement: snapshot of the call parameters only.
    json_content = locals().copy()
    json_content["models"] = []

    def model_path(n_clusters: int) -> str:
        # File receiving the model fitted with n_clusters in multi-values mode.
        return path.join(output, "clustering_" + str(n_clusters) + ".pkl")

    def fit_model(n_clusters: int, output_file: str):
        # Fit, save and register one model in the JSON summary.
        if debug:
            print(f"Computing clustering with k={n_clusters}")
        model = CLUSTERING_METHODS["k-means"]
        if mahalanobis:
            if debug:
                print("Mahalanobis activated")
            model = CLUSTERING_METHODS["k-means-mahalanobis"]
        model.fit(X, n_clusters, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": n_clusters,
        })

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if not multi_values:
        fit_model(k, output)
    elif not path.isdir(output):
        mkdir(output)

    # Multi values case with kmax
    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            fit_model(n_clusters, model_path(n_clusters))

    # Second multi values case with klist
    if klist is not None:
        for value in klist:
            # Values may arrive as strings from the command line.
            n_clusters = int(value)
            # The file name is built from the current value of the list (it
            # used to reference the index variable of the kmax loop).
            fit_model(n_clusters, model_path(n_clusters))

    print(json.dumps(json_content))
144 144
145 145
def extract_run(features, lst, model, modeltype, outfile):
    """
    Predict the cluster of every item of a list file with a saved model and
    write the resulting (item, cluster) pairs to a file.

    A JSON object holding the path of the written file is printed on standard
    output.

    @param features: features file
    @param lst: list file selecting (and ordering) the items to label
    @param model: path of the saved clustering model
    @param modeltype: kind of model to load (key of CLUSTERING_METHODS)
    @param outfile: path of the file receiving the labels
    @return: None
    """
    features_by_key = read_features(features)
    keys = list(read_lst(lst))
    samples = np.asarray([features_by_key[key] for key in keys])

    clusterer = CLUSTERING_METHODS[modeltype]
    clusterer.load(model)
    predictions = clusterer.predict(samples)

    # One line per item, in the order of the list file.
    with open(outfile, "w") as handle:
        for position, key in enumerate(keys):
            write_line(key, predictions[position], handle)

    print(json.dumps({"outfile": outfile}))
162
163
if __name__ == "__main__":
    # Command-line entry point: one sub-command ("action") per operation.
    # Each sub-parser stores its own name in "which" so that the matching
    # handler can be selected after parsing.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Extract
    parser_extract = subparsers.add_parser(
        "extract", help="extract cluster labels")

    parser_extract.add_argument("--features", required=True, type=str, help="...")
    parser_extract.add_argument("--lst", required=True, type=str, help="...")
    parser_extract.add_argument("--model", required=True, type=str, help="...")
    parser_extract.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_extract.add_argument("--outfile", required=True, type=str, help="...")
    parser_extract.set_defaults(which="extract")

    # Parse
    args = parser.parse_args()

    # "which" is only set by a sub-parser: when the script is started without
    # any action the attribute does not exist, so report a usage error instead
    # of crashing on the attribute access.
    action = getattr(args, "which", None)
    if action is None:
        parser.error("no action given; choose one of: kmeans, measure, disequilibrium, extract")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run,
        "extract": extract_run
    })

    # The parsed options are handed to the selected handler, minus the
    # "which" selector (presumably as keyword arguments; see SubCommandRunner).
    runner.run(action, vars(args), remove="which")
225 258