Commit fea9649a748844cca1201bf3612e6172f0ccfd19

Authored by quillotm
1 parent 09f3471d67
Exists in master

Add many measures to compute

Showing 1 changed file with 5 additions and 2 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels 4 from core.data import read_features, read_lst, read_labels
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 9
10 from sklearn.preprocessing import LabelEncoder 10 from sklearn.preprocessing import LabelEncoder
11 from sklearn.metrics import v_measure_score 11 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
12 12
13 import core.measures 13 import core.measures
14 import json 14 import json
15 15
16 16
# Clustering back-ends selectable through --modeltype, keyed by their CLI name.
# NOTE: the kmeans wrapper is instantiated once, when this module is imported;
# measure_run later calls .load() and .predict() on that shared instance.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
}
20 20
# Evaluation measures selectable through --measure, keyed by their CLI name.
# measure_run invokes each callable as evaluation(Y_truth, Y_pred).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}
25 28
26 29
def disequilibrium_run():
    """
    Placeholder for the "disequilibrium" sub-command: does nothing yet.

    NOTE(review): the "disequilibrium" parser collects --features, --lstrain,
    --lstest, --model and --model-type, while this stub accepts no argument.
    How SubCommandRunner forwards them is not visible here -- confirm before
    wiring a real implementation.

    @return: None
    """
    pass
29 32
30 33
def measure_run(measure: "list[str] | str", features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a stored clustering model and print the scores as a JSON object.

    The model is loaded through the CLUSTERING_METHODS entry for `modeltype`,
    used to predict a cluster for every key of the list file, and each
    requested measure is computed as evaluation(Y_truth, Y_pred).

    @param measure: measure name(s), each a key of EVALUATION_METHODS;
                    a single string is treated as a one-element list
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the samples
    @param truelabels: labels file, read with read_labels; the first element of
                       each entry is used as the reference label
    @param model: model path handed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS
    @return: None (the scores are printed on stdout)
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Inputs, predictions and encoded labels do not depend on the measure,
    # so they are computed once instead of once per requested measure.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in lst_keys])
    y_pred = module.predict(feats)

    # Encode the reference labels as integers before scoring.
    y_truth = [labels_dict[key][0] for key in lst_keys]
    encoder = LabelEncoder()
    encoder.fit(y_truth)
    y_truth = encoder.transform(y_truth)

    scores = {}
    for ms in measure:
        scores[ms] = EVALUATION_METHODS[ms](y_truth, y_pred)

    print(json.dumps(scores))
63 66
64 67
65 68
def _fit_and_save_kmeans(X, n_clusters: int, destination: str):
    """Fit a KMeans model with `n_clusters` clusters on X and pickle it to `destination`."""
    print(f"Computing clustering with k={n_clusters}")
    estimator = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(destination, "wb") as handle:
        pickle.dump(estimator, handle)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Fit k-means on the listed samples and pickle the fitted model(s).

    Three modes are supported:
      - neither kmax nor klist: a single model with k clusters, saved to the
        file `output`;
      - kmax: one model per value in [k, kmax], saved as
        `output`/clustering_<k>.pkl;
      - klist: one model per listed value, saved the same way.
    When both kmax and klist are given, both series are computed.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the samples
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute, or None
    @param klist: list of k values to compute (converted with int()), or None
    @param output: output file in single-value mode, else output directory
    @raise Exception: if `output` is an existing directory in single-value
                      mode, or an existing file in a multi-value mode
    """
    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if not multi_values:
        _fit_and_save_kmeans(X, k, output)
        return

    # Multi values cases: one pickle per k inside the output directory.
    if not path.isdir(output):
        mkdir(output)

    # Multi values case with kmax
    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            _fit_and_save_kmeans(X, n_clusters, path.join(output, f"clustering_{n_clusters}.pkl"))

    # Second multi values case with klist
    if klist is not None:
        for value in klist:
            # --klist values arrive from argparse as strings.
            n_clusters = int(value)
            _fit_and_save_kmeans(X, n_clusters, path.join(output, f"clustering_{n_clusters}.pkl"))
116 119
117 120
if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    # "which" records the selected sub-command so it can be dispatched below.
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no "which" default is set; report a usage error
    # instead of failing with AttributeError on args.which below.
    if not hasattr(args, "which"):
        parser.error("a sub-command is required: kmeans, measure or disequilibrium")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    # NOTE(review): presumably run() drops the "which" entry and calls the
    # selected function with the remaining parsed arguments -- confirm in utils.
    runner.run(args.which, args.__dict__, remove="which")
181 184