Commit e828890879fbcb1b4426543be5a84a0dd31cf82a
1 parent
f0ca26aaf4
Exists in
master
Add the missing `which` default to the `disequilibrium` argparse sub-parser so that choosing this action can be dispatched.
Showing 1 changed file with 1 additions and 0 deletions Inline Diff
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | 9 | ||
10 | from sklearn.preprocessing import LabelEncoder | 10 | from sklearn.preprocessing import LabelEncoder |
11 | from sklearn.metrics import v_measure_score | 11 | from sklearn.metrics import v_measure_score |
12 | 12 | ||
13 | import core.measures | 13 | import core.measures |
14 | 14 | ||
15 | 15 | ||
# Clustering back-ends, keyed by the name offered as a choice for the
# "--modeltype" option of the "measure" sub-command.
# NOTE: kmeans() is called here, so the wrapper object is created once, at import time.
CLUSTERING_METHODS = {"k-means": kmeans()}

# Evaluation measures, keyed by the name offered as a choice for the
# "--measure" option. measure_run() calls the selected entry as
# evaluation(Y_truth, Y_pred).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "v-measure": v_measure_score,
}
24 | 24 | ||
25 | 25 | ||
def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder for the "disequilibrium" sub-command (not implemented yet).

    The parameters mirror the options declared on the "disequilibrium"
    sub-parser, so the parsed arguments can be passed as keyword arguments
    without raising a TypeError. Every parameter defaults to None so the
    previous zero-argument call keeps working.
    NOTE(review): this assumes SubCommandRunner forwards the parsed namespace
    as keyword arguments, as the other *_run signatures suggest - confirm in utils.

    @param features: value of --features
    @param lstrain: value of --lstrain
    @param lstest: value of --lstest
    @param model: value of --model
    @param model_type: value of --model-type (argparse stores it as "model_type")
    @return: None; the command currently does nothing
    """
    # Intentionally a no-op until the computation is implemented.
    return None
28 | 28 | ||
29 | 29 | ||
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model against reference labels and print the score.

    @param measure: key of EVALUATION_METHODS selecting the scoring function
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the samples
    @param truelabels: reference labels file, read with read_labels
    @param model: path handed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    """
    # Resolve and load the model first, then the measure (same order as before,
    # so an unknown model type is reported before an unknown measure).
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)
    evaluation = EVALUATION_METHODS[measure]

    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    # Freeze the key order once so features and labels stay aligned.
    lst_keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Only the first entry of each label record is used as the reference class;
    # the raw labels are mapped to integer codes before scoring.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    # Named "score" rather than "eval" to avoid shadowing the builtin.
    score = evaluation(Y_truth, Y_pred)
    print(score)
48 | 48 | ||
49 | 49 | ||
50 | 50 | ||
def _fit_and_save_kmeans(X, n_clusters: int, destination: str):
    """Fit a k-means model with n_clusters clusters on X and pickle it to destination."""
    print(f"Computing clustering with k={n_clusters}")
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager closes the file even if pickling fails.
    with open(destination, "wb") as handle:
        pickle.dump(model, handle)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Fit one or several k-means models on the listed features and pickle them.

    With neither kmax nor klist, a single model with k clusters is written to
    the file "output". Otherwise "output" is a directory (created if missing)
    receiving one "clustering_<k>.pkl" per value: every k in [k, kmax] when
    kmax is given, and every value of klist when klist is given. If both are
    given, both series are computed.

    @param features: features file
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param output: output file if kmax and klist not specified, else, output directory
    @raise Exception: if "output" is an existing directory in single-model mode,
        or an existing file in multi-model mode
    """
    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    single_model = kmax is None and klist is None

    # Exception cases
    if single_model and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if not single_model and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if single_model:
        _fit_and_save_kmeans(X, k, output)
        return

    # Multi values cases share the same output directory.
    if not path.isdir(output):
        mkdir(output)

    # Multi values case with kmax
    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            _fit_and_save_kmeans(X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    # Second multi values case with klist
    if klist is not None:
        for value in klist:
            # Values come from the command line as strings.
            n_clusters = int(value)
            _fit_and_save_kmeans(X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
101 | 101 | ||
102 | 102 | ||
if __name__ == "__main__":
    # Command-line entry point: one sub-command ("action") per operation.
    # Each sub-parser stores its own name under "which" so the matching
    # function can be looked up after parsing.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                type=str,
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional in Python 3 argparse: when no action is given
    # the namespace has no "which" attribute. Report a usage error instead of
    # failing with an AttributeError on the dispatch below.
    if not hasattr(args, "which"):
        parser.error("an action is required (choose from: " + ", ".join(subparsers.choices) + ")")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    # "which" only selects the command; it is removed before the remaining
    # parsed arguments are handed to the selected function.
    runner.run(args.which, args.__dict__, remove="which")
165 | 166 |