Commit 3e2abe83e33bc90ce6e11f0ab38fd27a80b63284
1 parent
e828890879
Exists in
master
Multiple outputs for the measure action. These multiple outputs are written as JSON.
Showing 1 changed file with 29 additions and 14 deletions Inline Diff
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | 9 | ||
10 | from sklearn.preprocessing import LabelEncoder | 10 | from sklearn.preprocessing import LabelEncoder |
11 | from sklearn.metrics import v_measure_score | 11 | from sklearn.metrics import v_measure_score |
12 | 12 | ||
13 | import core.measures | 13 | import core.measures |
14 | import json | ||
14 | 15 | ||
15 | 16 | ||
16 | CLUSTERING_METHODS = { | 17 | CLUSTERING_METHODS = { |
17 | "k-means": kmeans() | 18 | "k-means": kmeans() |
18 | } | 19 | } |
19 | 20 | ||
20 | EVALUATION_METHODS = { | 21 | EVALUATION_METHODS = { |
21 | "entropy": core.measures.entropy_score, | 22 | "entropy": core.measures.entropy_score, |
22 | "v-measure": v_measure_score | 23 | "v-measure": v_measure_score |
23 | } | 24 | } |
24 | 25 | ||
25 | 26 | ||
26 | def disequilibrium_run(): | 27 | def disequilibrium_run(): |
27 | pass | 28 | pass |
28 | 29 | ||
29 | 30 | ||
30 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | 31 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): |
32 | """ | ||
33 | |||
34 | @param measure: | ||
35 | @param features: | ||
36 | @param lst: | ||
37 | @param truelabels: | ||
38 | @param model: | ||
39 | @param modeltype: | ||
40 | @return: | ||
41 | """ | ||
31 | module = CLUSTERING_METHODS[modeltype] | 42 | module = CLUSTERING_METHODS[modeltype] |
32 | module.load(model) | 43 | module.load(model) |
33 | evaluation = EVALUATION_METHODS[measure] | ||
34 | feats_dict = read_features(features) | ||
35 | labels_dict = read_labels(truelabels) | ||
36 | lst_dict = read_lst(lst) | ||
37 | lst_keys = [key for key in lst_dict] | ||
38 | feats = np.asarray([feats_dict[key] for key in lst_keys]) | ||
39 | Y_pred = module.predict(feats) | ||
40 | Y_truth = [labels_dict[key][0] for key in lst_keys] | ||
41 | 44 | ||
42 | le = LabelEncoder() | 45 | eval = {} |
43 | le.fit(Y_truth) | 46 | for ms in measure: |
44 | Y_truth = le.transform(Y_truth) | 47 | evaluation = EVALUATION_METHODS[ms] |
48 | feats_dict = read_features(features) | ||
49 | labels_dict = read_labels(truelabels) | ||
50 | lst_dict = read_lst(lst) | ||
51 | lst_keys = [key for key in lst_dict] | ||
52 | feats = np.asarray([feats_dict[key] for key in lst_keys]) | ||
53 | Y_pred = module.predict(feats) | ||
54 | Y_truth = [labels_dict[key][0] for key in lst_keys] | ||
45 | 55 | ||
46 | eval = evaluation(Y_truth, Y_pred) | 56 | le = LabelEncoder() |
47 | print(eval) | 57 | le.fit(Y_truth) |
58 | Y_truth = le.transform(Y_truth) | ||
48 | 59 | ||
60 | eval[ms] = evaluation(Y_truth, Y_pred) | ||
49 | 61 | ||
62 | print(json.dumps(eval)) | ||
50 | 63 | ||
64 | |||
65 | |||
51 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | 66 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): |
52 | """ | 67 | """ |
53 | 68 | ||
54 | @param features: output features | 69 | @param features: output features |
55 | @param lst: list file | 70 | @param lst: list file |
56 | @param k: k (kmin if kmax specified) | 71 | @param k: k (kmin if kmax specified) |
57 | @param kmax: maximum k to compute | 72 | @param kmax: maximum k to compute |
58 | @param klist: list of k values to compute, ignore k value | 73 | @param klist: list of k values to compute, ignore k value |
59 | @param output: output file if kmax not specified, else, output directory | 74 | @param output: output file if kmax not specified, else, output directory |
60 | """ | 75 | """ |
61 | # -- READ FILES -- | 76 | # -- READ FILES -- |
62 | features_dict = read_features(features) | 77 | features_dict = read_features(features) |
63 | lst_dict = read_lst(lst) | 78 | lst_dict = read_lst(lst) |
64 | X = np.asarray([features_dict[x] for x in lst_dict]) | 79 | X = np.asarray([features_dict[x] for x in lst_dict]) |
65 | 80 | ||
66 | # Exception cases | 81 | # Exception cases |
67 | if kmax is None and klist is None and path.isdir(output): | 82 | if kmax is None and klist is None and path.isdir(output): |
68 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") | 83 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") |
69 | 84 | ||
70 | if (kmax is not None or klist is not None) and path.isfile(output): | 85 | if (kmax is not None or klist is not None) and path.isfile(output): |
71 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") | 86 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") |
72 | 87 | ||
73 | # Mono value case | 88 | # Mono value case |
74 | if kmax is None and klist is None: | 89 | if kmax is None and klist is None: |
75 | print(f"Computing clustering with k={k}") | 90 | print(f"Computing clustering with k={k}") |
76 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 91 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
77 | preds = kmeans.predict(X) | 92 | preds = kmeans.predict(X) |
78 | pickle.dump(kmeans, open(output, "wb")) | 93 | pickle.dump(kmeans, open(output, "wb")) |
79 | 94 | ||
80 | # Multi values case with kmax | 95 | # Multi values case with kmax |
81 | if kmax is not None: | 96 | if kmax is not None: |
82 | if not path.isdir(output): | 97 | if not path.isdir(output): |
83 | mkdir(output) | 98 | mkdir(output) |
84 | Ks = range(k, kmax + 1) | 99 | Ks = range(k, kmax + 1) |
85 | for i in Ks: | 100 | for i in Ks: |
86 | print(f"Computing clustering with k={i}") | 101 | print(f"Computing clustering with k={i}") |
87 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | 102 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) |
88 | preds = kmeans.predict(X) | 103 | preds = kmeans.predict(X) |
89 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | 104 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) |
90 | 105 | ||
91 | # Second multi values case with klist | 106 | # Second multi values case with klist |
92 | if klist is not None: | 107 | if klist is not None: |
93 | if not path.isdir(output): | 108 | if not path.isdir(output): |
94 | mkdir(output) | 109 | mkdir(output) |
95 | for k in klist: | 110 | for k in klist: |
96 | k = int(k) | 111 | k = int(k) |
97 | print(f"Computing clustering with k={k}") | 112 | print(f"Computing clustering with k={k}") |
98 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 113 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
99 | preds = kmeans.predict(X) | 114 | preds = kmeans.predict(X) |
100 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | 115 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) |
101 | 116 | ||
102 | 117 | ||
103 | if __name__ == "__main__": | 118 | if __name__ == "__main__": |
104 | # Main parser | 119 | # Main parser |
105 | parser = argparse.ArgumentParser(description="Clustering methods to apply") | 120 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
106 | subparsers = parser.add_subparsers(title="action") | 121 | subparsers = parser.add_subparsers(title="action") |
107 | 122 | ||
108 | # kmeans | 123 | # kmeans |
109 | parser_kmeans = subparsers.add_parser( | 124 | parser_kmeans = subparsers.add_parser( |
110 | "kmeans", help="Compute clustering using k-means algorithm") | 125 | "kmeans", help="Compute clustering using k-means algorithm") |
111 | 126 | ||
112 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") | 127 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") |
113 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") | 128 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") |
114 | parser_kmeans.add_argument("-k", default=2, type=int, | 129 | parser_kmeans.add_argument("-k", default=2, type=int, |
115 | help="number of clusters to compute. It is kmin if kmax is specified.") | 130 | help="number of clusters to compute. It is kmin if kmax is specified.") |
116 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") | 131 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
117 | parser_kmeans.add_argument("--klist", nargs="+", | 132 | parser_kmeans.add_argument("--klist", nargs="+", |
118 | help="List of k values to test. As kmax, activate the multi values mod.") | 133 | help="List of k values to test. As kmax, activate the multi values mod.") |
119 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | 134 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") |
120 | parser_kmeans.set_defaults(which="kmeans") | 135 | parser_kmeans.set_defaults(which="kmeans") |
121 | 136 | ||
122 | # measure | 137 | # measure |
123 | parser_measure = subparsers.add_parser( | 138 | parser_measure = subparsers.add_parser( |
124 | "measure", help="compute the entropy") | 139 | "measure", help="compute the entropy") |
125 | 140 | ||
126 | parser_measure.add_argument("--measure", | 141 | parser_measure.add_argument("--measure", |
127 | required=True, | 142 | required=True, |
128 | type=str, | 143 | nargs="+", |
129 | choices=[key for key in EVALUATION_METHODS], | 144 | choices=[key for key in EVALUATION_METHODS], |
130 | help="...") | 145 | help="...") |
131 | parser_measure.add_argument("--features", required=True, type=str, help="...") | 146 | parser_measure.add_argument("--features", required=True, type=str, help="...") |
132 | parser_measure.add_argument("--lst", required=True, type=str, help="...") | 147 | parser_measure.add_argument("--lst", required=True, type=str, help="...") |
133 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | 148 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") |
134 | parser_measure.add_argument("--model", required=True, type=str, help="...") | 149 | parser_measure.add_argument("--model", required=True, type=str, help="...") |
135 | parser_measure.add_argument("--modeltype", | 150 | parser_measure.add_argument("--modeltype", |
136 | required=True, | 151 | required=True, |
137 | choices=[key for key in CLUSTERING_METHODS], | 152 | choices=[key for key in CLUSTERING_METHODS], |
138 | help="type of model for learning") | 153 | help="type of model for learning") |
139 | parser_measure.set_defaults(which="measure") | 154 | parser_measure.set_defaults(which="measure") |
140 | 155 | ||
141 | # disequilibrium | 156 | # disequilibrium |
142 | parser_disequilibrium = subparsers.add_parser( | 157 | parser_disequilibrium = subparsers.add_parser( |
143 | "disequilibrium", help="...") | 158 | "disequilibrium", help="...") |
144 | 159 | ||
145 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | 160 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") |
146 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | 161 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") |
147 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | 162 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") |
148 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | 163 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") |
149 | parser_disequilibrium.add_argument("--model-type", | 164 | parser_disequilibrium.add_argument("--model-type", |
150 | required=True, | 165 | required=True, |
151 | choices=["kmeans", "2", "3"], | 166 | choices=["kmeans", "2", "3"], |
152 | help="...") | 167 | help="...") |
153 | parser_disequilibrium.set_defaults(which="disequilibrium") | 168 | parser_disequilibrium.set_defaults(which="disequilibrium") |
154 | 169 | ||
155 | # Parse | 170 | # Parse |
156 | args = parser.parse_args() | 171 | args = parser.parse_args() |
157 | 172 | ||
158 | # Run commands | 173 | # Run commands |