Commit 88d1d67e9da778e3e241f5775447bdaa7f2da76a
1 parent
660d9960f9
Exists in
master
Removing todo comments
Showing 1 changed file with 0 additions and 2 deletions Inline Diff
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | 9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis |
10 | 10 | ||
11 | from sklearn.preprocessing import LabelEncoder | 11 | from sklearn.preprocessing import LabelEncoder |
12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
13 | 13 | ||
14 | import core.measures | 14 | import core.measures |
15 | import json | 15 | import json |
16 | 16 | ||
17 | 17 | ||
# Registry of clustering implementations, keyed by the model-type name used
# on the command line. Each entry is a single shared instance created at
# import time.
CLUSTERING_METHODS = {}
CLUSTERING_METHODS["k-means"] = kmeans()
CLUSTERING_METHODS["k-means-mahalanobis"] = kmeansMahalanobis()
22 | 22 | ||
# Registry of evaluation measures, keyed by the name accepted by --measure.
# Every entry is called as score(true_labels, predicted_labels).
EVALUATION_METHODS = {}
# Measures implemented in this project.
EVALUATION_METHODS["entropy"] = core.measures.entropy_score
EVALUATION_METHODS["purity"] = core.measures.purity_score
# Measures provided by scikit-learn.
EVALUATION_METHODS["v-measure"] = v_measure_score
EVALUATION_METHODS["homogeneity"] = homogeneity_score
EVALUATION_METHODS["completeness"] = completeness_score
30 | 30 | ||
31 | 31 | ||
def disequilibrium_run():
    """Placeholder for the "disequilibrium" sub-command: not implemented, does nothing."""
    return None
34 | 34 | ||
35 | 35 | ||
def measure_run(measure: list, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a saved clustering model against reference labels and print the scores as JSON.

    @param measure: names of the measures to compute (keys of EVALUATION_METHODS)
    @param features: path of the features file, read with read_features
    @param lst: path of the list file; its keys select and order the evaluated samples
    @param truelabels: path of the reference labels file, read with read_labels
    @param model: path of the saved model, passed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the implementation that loads the model
    @return: None; a JSON object mapping each measure name to its score is printed on stdout
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # The inputs and the predictions do not depend on the measure, so they
    # are read and computed once instead of once per requested measure.
    # NOTE(review): assumes module.predict is deterministic for a loaded model.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Only the first label entry of each sample is used as its reference
    # class; the class names are then encoded as integers.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    # "scores" rather than "eval", to avoid shadowing the builtin.
    scores = {ms: EVALUATION_METHODS[ms](Y_truth, Y_pred) for ms in measure}

    print(json.dumps(scores))
68 | 68 | ||
69 | 69 | ||
def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Fit one or several k-means models on the listed samples and save them.

    @param features: path of the features file, read with read_features
    @param lst: list file; its keys select the samples used for fitting
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute; every value from k to kmax (inclusive) is fitted
    @param klist: list of k values to compute, ignore k value
    @param maxiter: forwarded to the model's fit()
    @param ninit: forwarded to the model's fit()
    @param output: output file if neither kmax nor klist is specified, else, output directory
    @param tol: forwarded to the model's fit()
    @param debug: print progress messages when true (also forwarded to fit())
    @param mahalanobis: use the "k-means-mahalanobis" implementation instead of "k-means"
    @raise Exception: when output is an existing directory but a file is expected, or the reverse
    """
    # Snapshot of the call arguments. This must remain the first statement so
    # that locals() contains nothing but the parameters.
    json_content = locals().copy()
    json_content["models"] = []

    def fit_model(k: int, output_file):
        """Fit a model with k clusters, save it to output_file and record it in json_content."""
        if debug:
            print(f"Computing clustering with k={k}")
        model = CLUSTERING_METHODS["k-means"]
        if mahalanobis:
            if debug:
                print("Mahalanobis activated")
            model = CLUSTERING_METHODS["k-means-mahalanobis"]
        model.fit(X, k, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": k,
        })

    def model_path(n_clusters: int) -> str:
        """Return the model file path inside the output directory for a given k."""
        return path.join(output, "clustering_" + str(n_clusters) + ".pkl")

    def ensure_output_dir():
        """Create the output directory when it does not exist yet."""
        if not path.isdir(output):
            mkdir(output)

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if not multi_values:
        fit_model(k, output)

    # Multi values case with kmax
    if kmax is not None:
        ensure_output_dir()
        for n_clusters in range(k, kmax + 1):
            fit_model(n_clusters, model_path(n_clusters))

    # Second multi values case with klist
    if klist is not None:
        ensure_output_dir()
        for value in klist:
            # Values arrive as strings from the command line.
            n_clusters = int(value)
            # The file name uses the current k. The previous code used the
            # kmax loop variable here, which is undefined when kmax is not
            # given and stale otherwise.
            fit_model(n_clusters, model_path(n_clusters))

    print(json_content)
143 | # TODO: compute loss with k-means mahalanobis. | ||
144 | # TODO: n_init have to be taken into account for mahalanobis case of k-means algorithm. | ||
145 | 143 | ||
146 | 144 | ||
if __name__ == "__main__":
    # Command-line entry point: one sub-command per action, each tagged with a
    # "which" default that selects the function to run.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    # Values stay strings here; kmeans_run converts them to int.
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    # NOTE(review): disequilibrium_run currently takes no parameters; confirm
    # how SubCommandRunner forwards these options before relying on this action.
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional for argparse, so without one the namespace has
    # no "which" attribute. Report a usage error instead of an AttributeError.
    if not hasattr(args, "which"):
        parser.error("an action is required: kmeans, measure or disequilibrium")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    runner.run(args.which, vars(args), remove="which")
226 | 224 |