Commit e828890879fbcb1b4426543be5a84a0dd31cf82a

Authored by quillotm
1 parent f0ca26aaf4
Exists in master

Add the `which` default to the disequilibrium argparse subparser so that the disequilibrium sub-command can be dispatched.

Showing 1 changed file with 1 additions and 0 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels 4 from core.data import read_features, read_lst, read_labels
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 9
10 from sklearn.preprocessing import LabelEncoder 10 from sklearn.preprocessing import LabelEncoder
11 from sklearn.metrics import v_measure_score 11 from sklearn.metrics import v_measure_score
12 12
13 import core.measures 13 import core.measures
14 14
15 15
# Clustering back-ends, keyed by the name accepted by the "measure"
# sub-command's --modeltype option. The instance is created once at import
# time and reused by every lookup.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
}

# Scoring callables, keyed by the name accepted by the "measure"
# sub-command's --measure option. Each is called as f(Y_truth, Y_pred).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "v-measure": v_measure_score,
}
24 24
25 25
def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder for the "disequilibrium" sub-command; currently does nothing.

    The parameters mirror the options declared on the "disequilibrium"
    argparse sub-parser so that the parsed arguments can be passed in as
    keyword arguments without raising TypeError.
    NOTE(review): this presumes SubCommandRunner forwards the parsed
    arguments as keyword arguments, as the signatures of the other *_run
    functions suggest -- confirm against utils.SubCommandRunner.

    @param features: value of --features (unused for now)
    @param lstrain: value of --lstrain (unused for now)
    @param lstest: value of --lstest (unused for now)
    @param model: value of --model (unused for now)
    @param model_type: value of --model-type (unused for now)
    """
    pass
28 28
29 29
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model against reference labels and print the result.

    @param measure: key of EVALUATION_METHODS selecting the scoring function
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the evaluated items
    @param truelabels: reference labels file, read with read_labels
    @param model: saved model, handed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)
    evaluation = EVALUATION_METHODS[measure]

    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_dict = read_lst(lst)

    # A single key list keeps features and labels aligned item by item.
    lst_keys = list(lst_dict)
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Only the first entry of each label record is used as the reference label.
    Y_truth = [labels_dict[key][0] for key in lst_keys]
    # Encode the reference labels as integers before scoring.
    Y_truth = LabelEncoder().fit_transform(Y_truth)

    # Named "score" rather than "eval" to avoid shadowing the builtin.
    score = evaluation(Y_truth, Y_pred)
    print(score)
48 48
49 49
50 50
def _fit_and_save_kmeans(X, n_clusters: int, destination: str) -> None:
    """Fit a k-means model with n_clusters clusters on X and pickle it to destination."""
    print(f"Computing clustering with k={n_clusters}")
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager closes the file even if pickling fails.
    with open(destination, "wb") as model_file:
        pickle.dump(model, model_file)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Fit one or several k-means models and pickle each fitted model.

    With neither kmax nor klist, a single model with k clusters is written to
    the file "output". Otherwise "output" is a directory (created if missing)
    receiving one "clustering_<k>.pkl" file per value of k. If both kmax and
    klist are given, both sets of values are computed.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the rows to cluster
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute (inclusive), or None
    @param klist: list of k values to compute (converted with int()), or None
    @param output: output file if neither kmax nor klist is specified, else output directory
    @raise Exception: if "output" is an existing directory where a file is
        expected, or an existing file where a directory is expected
    """
    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if not multi_values:
        _fit_and_save_kmeans(X, k, output)
        return

    # Multi values cases: one pickle per k inside the output directory
    if not path.isdir(output):
        mkdir(output)

    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            _fit_and_save_kmeans(
                X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    if klist is not None:
        for value in klist:
            # Values may arrive as command-line strings.
            n_clusters = int(value)
            _fit_and_save_kmeans(
                X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
101 101
102 102
if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                type=str,
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    # argparse stores this option as "model_type" (dash replaced by underscore).
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional by default in Python 3 argparse: when none is
    # given, no sub-parser default is applied and "which" is never set. Report
    # a usage error instead of crashing with AttributeError below.
    if getattr(args, "which", None) is None:
        parser.error("an action is required (kmeans, measure or disequilibrium)")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    # "which" only selects the handler; it is excluded via remove="which".
    runner.run(args.which, vars(args), remove="which")
165 166