Commit 9191399c3b15f017c4a84edeacdb799b490c07e4
1 parent
40650f20d7
Exists in
master
Clustering and evaluation are now available, and they can be configured through global variables.
Showing 3 changed files with 95 additions and 7 deletions Side-by-side Diff
volia/clustering.py
1 | 1 | import argparse |
2 | 2 | from os import path, mkdir |
3 | 3 | from utils import SubCommandRunner |
4 | -from core.data import read_features, read_lst | |
5 | - | |
4 | +from core.data import read_features, read_lst, read_labels | |
6 | 5 | import numpy as np |
7 | 6 | from sklearn.cluster import KMeans |
8 | 7 | import pickle |
8 | +from clustering_modules.kmeans import kmeans | |
9 | 9 | |
10 | +from sklearn.preprocessing import LabelEncoder | |
11 | +from sklearn.metrics import v_measure_score | |
10 | 12 | |
13 | +import core.measures | |
14 | + | |
15 | + | |
16 | +CLUSTERING_METHODS = { | |
17 | + "k-means": kmeans() | |
18 | +} | |
19 | + | |
20 | +EVALUATION_METHODS = { | |
21 | + "entropy": core.measures.entropy_score, | |
22 | + "v-measure": v_measure_score | |
23 | +} | |
24 | + | |
25 | + | |
26 | +def disequilibrium_run(): | |
27 | + pass | |
28 | + | |
29 | + | |
30 | +def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | |
31 | + module = CLUSTERING_METHODS[modeltype] | |
32 | + module.load(model) | |
33 | + evaluation = EVALUATION_METHODS[measure] | |
34 | + feats_dict = read_features(features) | |
35 | + labels_dict = read_labels(truelabels) | |
36 | + lst_dict = read_lst(lst) | |
37 | + lst_keys = [key for key in lst_dict] | |
38 | + feats = np.asarray([feats_dict[key] for key in lst_keys]) | |
39 | + Y_pred = module.predict(feats) | |
40 | + Y_truth = [labels_dict[key][0] for key in lst_keys] | |
41 | + | |
42 | + le = LabelEncoder() | |
43 | + le.fit(Y_truth) | |
44 | + Y_truth = le.transform(Y_truth) | |
45 | + | |
46 | + eval = evaluation(Y_truth, Y_pred) | |
47 | + print(eval) | |
48 | + | |
49 | + | |
50 | + | |
11 | 51 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): |
12 | 52 | """ |
13 | 53 | |
... | ... | @@ -18,7 +58,7 @@ |
18 | 58 | @param klist: list of k values to compute, ignore k value |
19 | 59 | @param output: output file if kmax not specified, else, output directory |
20 | 60 | """ |
21 | - # -- READE FILES -- | |
61 | + # -- READ FILES -- | |
22 | 62 | features_dict = read_features(features) |
23 | 63 | lst_dict = read_lst(lst) |
24 | 64 | X = np.asarray([features_dict[x] for x in lst_dict]) |
25 | 65 | |
... | ... | @@ -79,12 +119,46 @@ |
79 | 119 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") |
80 | 120 | parser_kmeans.set_defaults(which="kmeans") |
81 | 121 | |
122 | + # measure | |
123 | + parser_measure = subparsers.add_parser( | |
124 | + "measure", help="compute the entropy") | |
125 | + | |
126 | + parser_measure.add_argument("--measure", | |
127 | + required=True, | |
128 | + type=str, | |
129 | + choices=[key for key in EVALUATION_METHODS], | |
130 | + help="...") | |
131 | + parser_measure.add_argument("--features", required=True, type=str, help="...") | |
132 | + parser_measure.add_argument("--lst", required=True, type=str, help="...") | |
133 | + parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | |
134 | + parser_measure.add_argument("--model", required=True, type=str, help="...") | |
135 | + parser_measure.add_argument("--modeltype", | |
136 | + required=True, | |
137 | + choices=[key for key in CLUSTERING_METHODS], | |
138 | + help="type of model for learning") | |
139 | + parser_measure.set_defaults(which="measure") | |
140 | + | |
141 | + # disequilibrium | |
142 | + parser_disequilibrium = subparsers.add_parser( | |
143 | + "disequilibrium", help="...") | |
144 | + | |
145 | + parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | |
146 | + parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | |
147 | + parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | |
148 | + parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | |
149 | + parser_disequilibrium.add_argument("--model-type", | |
150 | + required=True, | |
151 | + choices=["kmeans", "2", "3"], | |
152 | + help="...") | |
153 | + | |
82 | 154 | # Parse |
83 | 155 | args = parser.parse_args() |
84 | 156 | |
85 | 157 | # Run commands |
86 | 158 | runner = SubCommandRunner({ |
87 | - "kmeans": kmeans_run | |
159 | + "kmeans": kmeans_run, | |
160 | + "measure": measure_run, | |
161 | + "disequilibrium": disequilibrium_run | |
88 | 162 | }) |
89 | 163 | |
90 | 164 | runner.run(args.which, args.__dict__, remove="which") |
volia/clustering_modules/kmeans.py
1 | + | |
2 | +from sklearn.cluster import KMeans | |
3 | +import pickle | |
4 | +from abstract_clustering import AbstractClustering | |
5 | + | |
6 | +class kmeans(): | |
7 | + def __init__(self): | |
8 | + self.kmeans_model = None | |
9 | + | |
10 | + def predict(self, features): | |
11 | + return self.kmeans_model.predict(features) | |
12 | + | |
13 | + def load(self, model_path): | |
14 | + self.kmeans_model = pickle.load(open(model_path, "rb")) |
volia/core/measures.py
... | ... | @@ -148,7 +148,7 @@ |
148 | 148 | |
149 | 149 | result = result_vector * dividers / dividers.sum() |
150 | 150 | result = result.sum() |
151 | - return (result_matrix, result_vector, result) | |
151 | + return result | |
152 | 152 | |
153 | 153 | |
154 | 154 | def purity_score(y_truth, y_hat): |
... | ... | @@ -181,7 +181,7 @@ |
181 | 181 | vector_purity = np.sum(matrix_divided, axis=axis) |
182 | 182 | |
183 | 183 | scalar_purity = np.average(vector_purity, weights=count_per_row) |
184 | - return (vector_purity, scalar_purity) | |
184 | + return scalar_purity | |
185 | 185 | |
186 | 186 | |
187 | 187 | count_matrix = compute_count_matrix(y_truth, y_hat) |