Commit 88d1d67e9da778e3e241f5775447bdaa7f2da76a

Authored by quillotm
1 parent 660d9960f9
Exists in master

Removing todo comments

Showing 1 changed file with 0 additions and 2 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels 4 from core.data import read_features, read_lst, read_labels
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis 9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
10 10
11 from sklearn.preprocessing import LabelEncoder 11 from sklearn.preprocessing import LabelEncoder
12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score 12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
13 13
14 import core.measures 14 import core.measures
15 import json 15 import json
16 16
17 17
# Registry of available clustering back-ends, keyed by the name used on the
# command line (--modeltype / the kmeans --mahalanobis switch).
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis()
}

# Registry of evaluation metrics selectable via the "measure" sub-command
# (--measure accepts any subset of these keys).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}
30 30
31 31
def disequilibrium_run():
    """Placeholder handler for the "disequilibrium" sub-command (not implemented yet)."""
    pass
34 34
35 35
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a trained clustering model against ground-truth labels.

    @param measure: iterable of measure names (keys of EVALUATION_METHODS)
    @param features: path of the features file
    @param lst: path of the list file (.lst) selecting which keys to score
    @param truelabels: path of the ground-truth labels file
    @param model: path of the serialized clustering model to load
    @param modeltype: key of CLUSTERING_METHODS identifying the model class
    @return: None; prints a JSON dict mapping measure name -> score
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Load the data and run the prediction exactly once: none of this depends
    # on the measure being computed (the original re-read every file and
    # re-ran module.predict on each iteration of the loop below).
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_dict = read_lst(lst)
    lst_keys = [key for key in lst_dict]
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)
    Y_truth = [labels_dict[key][0] for key in lst_keys]

    # Encode the string labels as integers for the sklearn-style metrics.
    le = LabelEncoder()
    le.fit(Y_truth)
    Y_truth = le.transform(Y_truth)

    # "scores" instead of "eval": avoids shadowing the builtin eval().
    scores = {}
    for ms in measure:
        evaluation = EVALUATION_METHODS[ms]
        scores[ms] = evaluation(Y_truth, Y_pred)

    print(json.dumps(scores))
68 68
69 69
def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Train one or several k-means models over the selected features.

    @param features: output features
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param maxiter: maximum number of iterations before stopping
    @param ninit: number of runs with different centroid seeds
    @param output: output file if kmax not specified, else, output directory
    @param tol: convergence tolerance on centroid updates
    @param debug: print progress information when True
    @param mahalanobis: use the Mahalanobis-distance variant of k-means.
    """
    # Snapshot of the call arguments; fitted models are appended below so the
    # printed summary records both configuration and outputs.
    json_content = locals().copy()

    def fit_model(k: int, output_file):
        # Fit one model for the given k, save it, and record it in the summary.
        if debug:
            print(f"Computing clustering with k={k}")
        model = CLUSTERING_METHODS["k-means"]
        if mahalanobis:
            if debug:
                print("Mahalanobis activated")
            model = CLUSTERING_METHODS["k-means-mahalanobis"]
        model.fit(X, k, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": k,
        })

    json_content["models"] = []

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    # Exception cases
    if kmax is None and klist is None and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if (kmax is not None or klist is not None) and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if kmax is None and klist is None:
        fit_model(k, output)

    # Multi values case with kmax
    if kmax is not None:
        if not path.isdir(output):
            mkdir(output)
        Ks = range(k, kmax + 1)
        for i in Ks:
            fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))

    # Second multi values case with klist
    if klist is not None:
        if not path.isdir(output):
            mkdir(output)
        for k in klist:
            k = int(k)
            # BUG FIX: the original built the filename with str(i) — a stale
            # index leaked from the kmax loop (or a NameError when kmax was
            # absent) — so every klist model was written to the same file.
            fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl"))

    print(json_content)
145 143
146 144
if __name__ == "__main__":
    # Main parser: each sub-command maps to one *_run function via the
    # "which" default set on its sub-parser.
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans: train one model (single k) or several (kmax range / klist).
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure: score a saved model with one or more evaluation metrics.
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=[key for key in EVALUATION_METHODS],
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=[key for key in CLUSTERING_METHODS],
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium: sub-command wired up but its handler is still a stub.
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Run commands: dispatch on args.which; the runner forwards the remaining
    # parsed arguments as keyword arguments to the selected function.
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    runner.run(args.which, args.__dict__, remove="which")