Commit fea9649a748844cca1201bf3612e6172f0ccfd19
1 parent
09f3471d67
Exists in
master
Add many measures to compute
Showing 1 changed file with 5 additions and 2 deletions Inline Diff
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | 9 | ||
10 | from sklearn.preprocessing import LabelEncoder | 10 | from sklearn.preprocessing import LabelEncoder |
11 | from sklearn.metrics import v_measure_score | 11 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
12 | 12 | ||
13 | import core.measures | 13 | import core.measures |
14 | import json | 14 | import json |
15 | 15 | ||
16 | 16 | ||
# Clustering back-ends selectable on the command line, keyed by the value
# accepted for ``--modeltype``. Each value is an already-built instance.
CLUSTERING_METHODS = {"k-means": kmeans()}

# Evaluation functions selectable with ``--measure``; measure_run calls each
# one as ``score(Y_truth, Y_pred)``.
EVALUATION_METHODS = {
    # project-local measures
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    # scikit-learn measures
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}
25 | 28 | ||
26 | 29 | ||
def disequilibrium_run(features: str = None, lstrain: str = None, lstest: str = None,
                       model: str = None, model_type: str = None):
    """
    Placeholder for the "disequilibrium" sub-command: currently does nothing.

    The parameters mirror the options declared on the "disequilibrium"
    sub-parser so the handler can be invoked with them as keywords.
    NOTE(review): assumes SubCommandRunner forwards the parsed arguments as
    keyword arguments, as the other handlers' signatures suggest - confirm.

    @param features: value of --features (unused for now)
    @param lstrain: value of --lstrain (unused for now)
    @param lstest: value of --lstest (unused for now)
    @param model: value of --model (unused for now)
    @param model_type: value of --model-type (unused for now)
    @return: None
    """
    # Not implemented yet; kept as a silent no-op like the original stub.
    return None
29 | 32 | ||
30 | 33 | ||
def measure_run(measure, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model with one or more measures and print the
    result on stdout as a JSON object mapping measure name to score.

    @param measure: measure name, or iterable of names; each must be a key of EVALUATION_METHODS
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the items
    @param truelabels: labels file, read with read_labels; the first label of each item is used
    @param model: saved model, handed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @return: None
    """
    # The CLI passes a list (nargs="+"); also accept a single bare name so a
    # string is not iterated character by character.
    measures = [measure] if isinstance(measure, str) else list(measure)

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Resolve the scorers first so an unknown name fails before any file is read.
    scorers = {name: EVALUATION_METHODS[name] for name in measures}

    # Inputs and predictions do not depend on the measure: compute them once.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Ground-truth labels are encoded as integers before scoring.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    scores = {name: scorer(Y_truth, Y_pred) for name, scorer in scorers.items()}
    print(json.dumps(scores))
63 | 66 | ||
64 | 67 | ||
65 | 68 | ||
def _fit_and_save_kmeans(X, n_clusters: int, destination: str):
    """Fit a k-means model with n_clusters clusters on X and pickle it to destination."""
    print(f"Computing clustering with k={n_clusters}")
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager closes the file even if pickling fails.
    with open(destination, "wb") as handle:
        pickle.dump(model, handle)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Fit k-means on the listed features and pickle the fitted model(s).

    With neither kmax nor klist, a single model with k clusters is written to
    the file "output". Otherwise "output" is a directory (created if missing)
    receiving one "clustering_<k>.pkl" per value: first every k in [k, kmax]
    when kmax is given, then every value of klist when klist is given.

    @param features: features file
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute, or None
    @param klist: list of k values to compute (converted with int()), or None
    @param output: output file if neither kmax nor klist is specified, else output directory
    @raise Exception: if "output" is an existing directory in single-value mode,
                      or an existing file in multi-value mode
    """
    single_value = kmax is None and klist is None

    # Exception cases, checked before loading the features so a bad output
    # path is rejected immediately.
    if single_value and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if not single_value and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    # Mono value case
    if single_value:
        _fit_and_save_kmeans(X, k, output)
        return

    # Multi values cases: range up to kmax first, then the explicit klist
    cluster_counts = []
    if kmax is not None:
        cluster_counts.extend(range(k, kmax + 1))
    if klist is not None:
        cluster_counts.extend(int(value) for value in klist)

    if not path.isdir(output):
        mkdir(output)

    for n_clusters in cluster_counts:
        _fit_and_save_kmeans(X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
116 | 119 | ||
117 | 120 | ||
if __name__ == "__main__":
    # Command-line entry point: one sub-command per action. Each sub-parser
    # records its action name in "which", which selects the handler below.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="Compute evaluation measures for a clustering model")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no parser sets "which"; exit with a usage
    # message instead of crashing on the attribute access below.
    if not hasattr(args, "which"):
        parser.error("an action is required: kmeans, measure or disequilibrium")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    # vars(args) is the namespace's own __dict__; "which" is stripped by the runner.
    runner.run(args.which, vars(args), remove="which")
181 | 184 |