Commit 9191399c3b15f017c4a84edeacdb799b490c07e4

Authored by quillotm
1 parent 40650f20d7
Exists in master

Clustering and evaluation are now available, and both can be configured through global variables.

Showing 3 changed files with 95 additions and 7 deletions Side-by-side Diff

1 1 import argparse
2 2 from os import path, mkdir
3 3 from utils import SubCommandRunner
4   -from core.data import read_features, read_lst
5   -
  4 +from core.data import read_features, read_lst, read_labels
6 5 import numpy as np
7 6 from sklearn.cluster import KMeans
8 7 import pickle
  8 +from clustering_modules.kmeans import kmeans
9 9  
  10 +from sklearn.preprocessing import LabelEncoder
  11 +from sklearn.metrics import v_measure_score
10 12  
  13 +import core.measures
  14 +
  15 +
# Clustering modules selectable by name; the keys are also offered as the
# choices of the "--modeltype" option of the "measure" sub-command.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
}

# Evaluation functions selectable by name; the keys are also offered as the
# choices of the "--measure" option. Each one is called as f(y_truth, y_pred).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "v-measure": v_measure_score,
}
  24 +
  25 +
def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder for the "disequilibrium" sub-command: not implemented yet.

    The parameters mirror the options declared on the "disequilibrium" parser
    (argparse stores "--model-type" under the name model_type), so the function
    can be called with the parsed arguments. All of them are currently ignored.

    @param features: value of --features
    @param lstrain: value of --lstrain
    @param lstest: value of --lstest
    @param model: value of --model
    @param model_type: value of --model-type
    """
    # Intentionally a no-op until the computation is written.
    pass
  28 +
  29 +
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a stored clustering model against reference labels and print the result.

    @param measure: key of EVALUATION_METHODS selecting the evaluation function
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the samples
    @param truelabels: labels file, read with read_labels
    @param model: path passed to the load() method of the clustering module
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)
    evaluation = EVALUATION_METHODS[measure]

    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))

    # Features and reference labels are both gathered in lst order so that
    # predictions and truth stay aligned index by index.
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    y_pred = module.predict(feats)

    # Only the first element of each label entry is used as the reference
    # class; it is then encoded to integers for the evaluation function.
    y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    # Named `score` rather than `eval` so the builtin is not shadowed.
    score = evaluation(y_truth, y_pred)
    print(score)
  48 +
  49 +
  50 +
11 51 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
12 52 """
13 53  
... ... @@ -18,7 +58,7 @@
18 58 @param klist: list of k values to compute, ignore k value
19 59 @param output: output file if kmax not specified, else, output directory
20 60 """
21   - # -- READE FILES --
  61 + # -- READ FILES --
22 62 features_dict = read_features(features)
23 63 lst_dict = read_lst(lst)
24 64 X = np.asarray([features_dict[x] for x in lst_dict])
25 65  
... ... @@ -79,12 +119,46 @@
79 119 parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
80 120 parser_kmeans.set_defaults(which="kmeans")
81 121  
  122 + # measure
  123 + parser_measure = subparsers.add_parser(
  124 + "measure", help="compute the entropy")
  125 +
  126 + parser_measure.add_argument("--measure",
  127 + required=True,
  128 + type=str,
  129 + choices=[key for key in EVALUATION_METHODS],
  130 + help="...")
  131 + parser_measure.add_argument("--features", required=True, type=str, help="...")
  132 + parser_measure.add_argument("--lst", required=True, type=str, help="...")
  133 + parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
  134 + parser_measure.add_argument("--model", required=True, type=str, help="...")
  135 + parser_measure.add_argument("--modeltype",
  136 + required=True,
  137 + choices=[key for key in CLUSTERING_METHODS],
  138 + help="type of model for learning")
  139 + parser_measure.set_defaults(which="measure")
  140 +
  141 + # disequilibrium
  142 + parser_disequilibrium = subparsers.add_parser(
  143 + "disequilibrium", help="...")
  144 +
  145 + parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
  146 + parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
  147 + parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
  148 + parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
  149 + parser_disequilibrium.add_argument("--model-type",
  150 + required=True,
  151 + choices=["kmeans", "2", "3"],
  152 + help="...")
  153 +
82 154 # Parse
83 155 args = parser.parse_args()
84 156  
85 157 # Run commands
86 158 runner = SubCommandRunner({
87   - "kmeans": kmeans_run
  159 + "kmeans": kmeans_run,
  160 + "measure": measure_run,
  161 + "disequilibrium": disequilibrium_run
88 162 })
89 163  
90 164 runner.run(args.which, args.__dict__, remove="which")
volia/clustering_modules/kmeans.py
  1 +
  2 +from sklearn.cluster import KMeans
  3 +import pickle
  4 +from abstract_clustering import AbstractClustering
  5 +
class kmeans():
    """
    Thin wrapper around a pickled model, exposing load() and predict().

    The wrapped object is presumably a fitted sklearn KMeans, but any
    unpickled object with a predict() method works.
    """

    def __init__(self):
        # No model is available until load() has been called.
        self.kmeans_model = None

    def predict(self, features):
        """
        Return the loaded model's predict() result for the given features.

        load() must be called first: until then kmeans_model is None and this
        call fails with AttributeError.
        """
        return self.kmeans_model.predict(features)

    def load(self, model_path):
        """
        Unpickle the model stored at model_path and keep it in kmeans_model.
        """
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that come from a trusted source.
        # The context manager closes the file handle even if unpickling fails.
        with open(model_path, "rb") as model_file:
            self.kmeans_model = pickle.load(model_file)
volia/core/measures.py
... ... @@ -148,7 +148,7 @@
148 148  
149 149 result = result_vector * dividers / dividers.sum()
150 150 result = result.sum()
151   - return (result_matrix, result_vector, result)
  151 + return result
152 152  
153 153  
154 154 def purity_score(y_truth, y_hat):
... ... @@ -181,7 +181,7 @@
181 181 vector_purity = np.sum(matrix_divided, axis=axis)
182 182  
183 183 scalar_purity = np.average(vector_purity, weights=count_per_row)
184   - return (vector_purity, scalar_purity)
  184 + return scalar_purity
185 185  
186 186  
187 187 count_matrix = compute_count_matrix(y_truth, y_hat)