Commit 9191399c3b15f017c4a84edeacdb799b490c07e4
1 parent: 40650f20d7
Exists in: master
Clustering and evaluation are now available and can be configured through global variables.
Showing 3 changed files with 95 additions and 7 deletions
volia/clustering.py
| 1 | 1 | import argparse |
| 2 | 2 | from os import path, mkdir |
| 3 | 3 | from utils import SubCommandRunner |
| 4 | -from core.data import read_features, read_lst | |
| 5 | - | |
| 4 | +from core.data import read_features, read_lst, read_labels | |
| 6 | 5 | import numpy as np |
| 7 | 6 | from sklearn.cluster import KMeans |
| 8 | 7 | import pickle |
| 8 | +from clustering_modules.kmeans import kmeans | |
| 9 | 9 | |
| 10 | +from sklearn.preprocessing import LabelEncoder | |
| 11 | +from sklearn.metrics import v_measure_score | |
| 10 | 12 | |
| 13 | +import core.measures | |
| 14 | + | |
| 15 | + | |
| 16 | +CLUSTERING_METHODS = { | |
| 17 | + "k-means": kmeans() | |
| 18 | +} | |
| 19 | + | |
| 20 | +EVALUATION_METHODS = { | |
| 21 | + "entropy": core.measures.entropy_score, | |
| 22 | + "v-measure": v_measure_score | |
| 23 | +} | |
| 24 | + | |
| 25 | + | |
| 26 | +def disequilibrium_run(): | |
| 27 | + pass | |
| 28 | + | |
| 29 | + | |
| 30 | +def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | |
| 31 | + module = CLUSTERING_METHODS[modeltype] | |
| 32 | + module.load(model) | |
| 33 | + evaluation = EVALUATION_METHODS[measure] | |
| 34 | + feats_dict = read_features(features) | |
| 35 | + labels_dict = read_labels(truelabels) | |
| 36 | + lst_dict = read_lst(lst) | |
| 37 | + lst_keys = [key for key in lst_dict] | |
| 38 | + feats = np.asarray([feats_dict[key] for key in lst_keys]) | |
| 39 | + Y_pred = module.predict(feats) | |
| 40 | + Y_truth = [labels_dict[key][0] for key in lst_keys] | |
| 41 | + | |
| 42 | + le = LabelEncoder() | |
| 43 | + le.fit(Y_truth) | |
| 44 | + Y_truth = le.transform(Y_truth) | |
| 45 | + | |
| 46 | + eval = evaluation(Y_truth, Y_pred) | |
| 47 | + print(eval) | |
| 48 | + | |
| 49 | + | |
| 50 | + | |
| 11 | 51 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): |
| 12 | 52 | """ |
| 13 | 53 | |
| ... | ... | @@ -18,7 +58,7 @@ |
| 18 | 58 | @param klist: list of k values to compute, ignore k value |
| 19 | 59 | @param output: output file if kmax not specified, else, output directory |
| 20 | 60 | """ |
| 21 | - # -- READE FILES -- | |
| 61 | + # -- READ FILES -- | |
| 22 | 62 | features_dict = read_features(features) |
| 23 | 63 | lst_dict = read_lst(lst) |
| 24 | 64 | X = np.asarray([features_dict[x] for x in lst_dict]) |
| 25 | 65 | |
| ... | ... | @@ -79,12 +119,46 @@ |
| 79 | 119 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") |
| 80 | 120 | parser_kmeans.set_defaults(which="kmeans") |
| 81 | 121 | |
| 122 | + # measure | |
| 123 | + parser_measure = subparsers.add_parser( | |
| 124 | + "measure", help="compute the entropy") | |
| 125 | + | |
| 126 | + parser_measure.add_argument("--measure", | |
| 127 | + required=True, | |
| 128 | + type=str, | |
| 129 | + choices=[key for key in EVALUATION_METHODS], | |
| 130 | + help="...") | |
| 131 | + parser_measure.add_argument("--features", required=True, type=str, help="...") | |
| 132 | + parser_measure.add_argument("--lst", required=True, type=str, help="...") | |
| 133 | + parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | |
| 134 | + parser_measure.add_argument("--model", required=True, type=str, help="...") | |
| 135 | + parser_measure.add_argument("--modeltype", | |
| 136 | + required=True, | |
| 137 | + choices=[key for key in CLUSTERING_METHODS], | |
| 138 | + help="type of model for learning") | |
| 139 | + parser_measure.set_defaults(which="measure") | |
| 140 | + | |
| 141 | + # disequilibrium | |
| 142 | + parser_disequilibrium = subparsers.add_parser( | |
| 143 | + "disequilibrium", help="...") | |
| 144 | + | |
| 145 | + parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | |
| 146 | + parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | |
| 147 | + parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | |
| 148 | + parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | |
| 149 | + parser_disequilibrium.add_argument("--model-type", | |
| 150 | + required=True, | |
| 151 | + choices=["kmeans", "2", "3"], | |
| 152 | + help="...") | |
| 153 | + | |
| 82 | 154 | # Parse |
| 83 | 155 | args = parser.parse_args() |
| 84 | 156 | |
| 85 | 157 | # Run commands |
| 86 | 158 | runner = SubCommandRunner({ |
| 87 | - "kmeans": kmeans_run | |
| 159 | + "kmeans": kmeans_run, | |
| 160 | + "measure": measure_run, | |
| 161 | + "disequilibrium": disequilibrium_run | |
| 88 | 162 | }) |
| 89 | 163 | |
| 90 | 164 | runner.run(args.which, args.__dict__, remove="which") |
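For context, the diff above wires two global dictionaries (`CLUSTERING_METHODS`, `EVALUATION_METHODS`) into a new `measure` subcommand. The following is a minimal, self-contained sketch of that dispatch-table pattern; the `KMeansBackend` class, `measure` helper, and toy data are illustrative stand-ins, not repository code.

```python
# Sketch of the dispatch-table pattern introduced in volia/clustering.py:
# global dicts map a CLI name to a clustering backend or an evaluation
# function, so adding a method only requires editing one dictionary.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import v_measure_score
from sklearn.preprocessing import LabelEncoder


class KMeansBackend:
    """Thin wrapper mirroring clustering_modules/kmeans.py (predict on a fitted model)."""
    def __init__(self, n_clusters=2):
        # Toy data only for the sketch; the real wrapper loads a pickled model.
        self.model = KMeans(n_clusters=n_clusters, n_init=10).fit(np.random.rand(20, 3))

    def predict(self, features):
        return self.model.predict(features)


CLUSTERING_METHODS = {"k-means": KMeansBackend()}
EVALUATION_METHODS = {"v-measure": v_measure_score}


def measure(measure_name, model_type, feats, y_truth):
    # Same flow as measure_run: look up backend and metric by key,
    # encode the ground-truth labels, then score the predictions.
    backend = CLUSTERING_METHODS[model_type]
    metric = EVALUATION_METHODS[measure_name]
    y_pred = backend.predict(feats)
    y_truth = LabelEncoder().fit_transform(y_truth)
    return metric(y_truth, y_pred)


if __name__ == "__main__":
    feats = np.random.rand(10, 3)
    labels = ["a", "b"] * 5
    print(measure("v-measure", "k-means", feats, labels))
```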
volia/clustering_modules/kmeans.py
| 1 | + | |
| 2 | +from sklearn.cluster import KMeans | |
| 3 | +import pickle | |
| 4 | +from abstract_clustering import AbstractClustering | |
| 5 | + | |
| 6 | +class kmeans(): | |
| 7 | + def __init__(self): | |
| 8 | + self.kmeans_model = None | |
| 9 | + | |
| 10 | + def predict(self, features): | |
| 11 | + return self.kmeans_model.predict(features) | |
| 12 | + | |
| 13 | + def load(self, model_path): | |
| 14 | + self.kmeans_model = pickle.load(open(model_path, "rb")) |
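The wrapper above unpickles an already fitted model, so the model file is presumably a pickled scikit-learn `KMeans` produced elsewhere (for example by `kmeans_run`). A hedged sketch of that round trip; `train_and_save` is a hypothetical helper, not code from this repository.

```python
# Hypothetical companion to clustering_modules/kmeans.py: fit a KMeans model,
# pickle it, then load it back the same way the wrapper's load() does.
import pickle
import numpy as np
from sklearn.cluster import KMeans


def train_and_save(features, k, model_path):
    model = KMeans(n_clusters=k, n_init=10).fit(features)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)


X = np.random.rand(50, 4)                  # toy features
train_and_save(X, k=3, model_path="kmeans.pkl")

with open("kmeans.pkl", "rb") as f:        # mirrors kmeans.load()
    loaded = pickle.load(f)
print(loaded.predict(X[:5]))               # mirrors kmeans.predict()
```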
volia/core/measures.py
| ... | ... | @@ -148,7 +148,7 @@ |
| 148 | 148 | |
| 149 | 149 | result = result_vector * dividers / dividers.sum() |
| 150 | 150 | result = result.sum() |
| 151 | - return (result_matrix, result_vector, result) | |
| 151 | + return result | |
| 152 | 152 | |
| 153 | 153 | |
| 154 | 154 | def purity_score(y_truth, y_hat): |
| ... | ... | @@ -181,7 +181,7 @@ |
| 181 | 181 | vector_purity = np.sum(matrix_divided, axis=axis) |
| 182 | 182 | |
| 183 | 183 | scalar_purity = np.average(vector_purity, weights=count_per_row) |
| 184 | - return (vector_purity, scalar_purity) | |
| 184 | + return scalar_purity | |
| 185 | 185 | |
| 186 | 186 | |
| 187 | 187 | count_matrix = compute_count_matrix(y_truth, y_hat) |
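These two hunks change returns from tuples to single scalars, which matches how `measure_run` prints the evaluation result directly. A hedged usage sketch follows; the `(y_truth, y_hat)` signature is inferred from how `entropy_score` is called in `measure_run`, the label arrays are toy data, and running it assumes `volia/` is on `sys.path`.

```python
# Assumed calling convention after this commit: entropy_score is registered
# under "entropy" in EVALUATION_METHODS and called as evaluation(Y_truth, Y_pred),
# so it takes two label arrays and now returns a single scalar.
import numpy as np
from core.measures import entropy_score

y_truth = np.array([0, 0, 1, 1, 2, 2])
y_hat = np.array([0, 0, 1, 2, 2, 2])

score = entropy_score(y_truth, y_hat)   # single scalar, no longer a tuple
print(score)
```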