diff --git a/volia/clustering.py b/volia/clustering.py
index 7b2359f..8c4a98d 100644
--- a/volia/clustering.py
+++ b/volia/clustering.py
@@ -1,11 +1,80 @@
 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
-from core.data import read_features, read_lst
-
+from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
+from clustering_modules.kmeans import kmeans
+
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import v_measure_score
+
+import core.measures
+
+
+# Maps each --modeltype value to the object used to load a model and predict with it.
+CLUSTERING_METHODS = {
+    "k-means": kmeans()
+}
+
+# Maps each --measure value to the function called as f(Y_truth, Y_pred).
+EVALUATION_METHODS = {
+    "entropy": core.measures.entropy_score,
+    "v-measure": v_measure_score
+}
+
+
+def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
+    """
+    Placeholder for the "disequilibrium" sub-command: it currently does nothing.
+
+    The parameters mirror the destinations of the options declared on the
+    "disequilibrium" sub-parser. NOTE(review): this assumes SubCommandRunner
+    forwards the parsed options as keyword arguments, as the signature of
+    kmeans_run suggests -- to be confirmed. Every parameter defaults to None so
+    that a call without arguments keeps working.
+
+    @param features: value of --features
+    @param lstrain: value of --lstrain
+    @param lstest: value of --lstest
+    @param model: value of --model
+    @param model_type: value of --model-type
+    """
+    pass
+
+
+def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
+    """
+    Load a saved clustering model, predict a cluster for each listed sample
+    and print the chosen measure computed against the reference labels.
+
+    @param measure: key of EVALUATION_METHODS selecting the measure
+    @param features: features file, read with read_features
+    @param lst: list file, read with read_lst; its keys select and order the samples
+    @param truelabels: reference labels file, read with read_labels
+    @param model: saved model, handed to the load method of the selected module
+    @param modeltype: key of CLUSTERING_METHODS selecting the kind of model
+    """
+    module = CLUSTERING_METHODS[modeltype]
+    module.load(model)
+    evaluation = EVALUATION_METHODS[measure]
+    feats_dict = read_features(features)
+    labels_dict = read_labels(truelabels)
+    lst_dict = read_lst(lst)
+    lst_keys = list(lst_dict)
+    feats = np.asarray([feats_dict[key] for key in lst_keys])
+    Y_pred = module.predict(feats)
+    # Only the first element of each label entry is used as the reference class.
+    Y_truth = [labels_dict[key][0] for key in lst_keys]
+
+    # Encode the reference classes as integers before scoring.
+    Y_truth = LabelEncoder().fit_transform(Y_truth)
+
+    # Named "score" because "eval" would shadow the builtin.
+    score = evaluation(Y_truth, Y_pred)
+    print(score)
+
 
 
 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
@@ -18,7 +87,7 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
     @param klist: list of k values to compute, ignore k value
     @param output: output file if kmax not specified, else, output directory
     """
-    # -- READE FILES --
+    # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])
@@ -79,12 +148,48 @@ if __name__ == "__main__":
     parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
     parser_kmeans.set_defaults(which="kmeans")
 
+    # measure
+    parser_measure = subparsers.add_parser(
+        "measure", help="compute the entropy")
+
+    parser_measure.add_argument("--measure",
+                                required=True,
+                                type=str,
+                                choices=list(EVALUATION_METHODS),
+                                help="...")
+    parser_measure.add_argument("--features", required=True, type=str, help="...")
+    parser_measure.add_argument("--lst", required=True, type=str, help="...")
+    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
+    parser_measure.add_argument("--model", required=True, type=str, help="...")
+    parser_measure.add_argument("--modeltype",
+                                required=True,
+                                choices=list(CLUSTERING_METHODS),
+                                help="type of model for learning")
+    parser_measure.set_defaults(which="measure")
+
+    # disequilibrium
+    parser_disequilibrium = subparsers.add_parser(
+        "disequilibrium", help="...")
+
+    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--model-type",
+                                       required=True,
+                                       choices=["kmeans", "2", "3"],
+                                       help="...")
+    # Like the other sub-parsers: sets args.which, which the runner call below dispatches on.
+    parser_disequilibrium.set_defaults(which="disequilibrium")
+
     # Parse
     args = parser.parse_args()
 
     # Run commands
     runner = SubCommandRunner({
-        "kmeans": kmeans_run
+        "kmeans": kmeans_run,
+        "measure": measure_run,
+        "disequilibrium": disequilibrium_run
     })
 
-    runner.run(args.which, args.__dict__, remove="which")
\ No newline at end of file
+    runner.run(args.which, args.__dict__, remove="which")
diff --git a/volia/clustering_modules/kmeans.py b/volia/clustering_modules/kmeans.py
new file mode 100644
index 0000000..2e58f70
--- /dev/null
+++ b/volia/clustering_modules/kmeans.py
@@ -0,0 +1,28 @@
+
+from sklearn.cluster import KMeans
+import pickle
+from abstract_clustering import AbstractClustering
+
+class kmeans():
+    """
+    Thin wrapper around a pickled model, exposing load and predict.
+    """
+    def __init__(self):
+        # No model is available until load() has been called.
+        self.kmeans_model = None
+
+    def predict(self, features):
+        """
+        Return kmeans_model.predict(features); load() must have been called first.
+        """
+        return self.kmeans_model.predict(features)
+
+    def load(self, model_path):
+        """
+        Unpickle the model stored at model_path and keep it for predict.
+
+        NOTE: unpickling can execute arbitrary code; only load trusted files.
+        """
+        # The context manager closes the file even if unpickling fails.
+        with open(model_path, "rb") as model_file:
+            self.kmeans_model = pickle.load(model_file)
diff --git a/volia/core/measures.py b/volia/core/measures.py
index 0ef8967..4e5e94a 100644
--- a/volia/core/measures.py
+++ b/volia/core/measures.py
@@ -148,7 +148,7 @@ def entropy_score(y_truth, y_hat):
 
     result = result_vector * dividers / dividers.sum()
     result = result.sum()
-    return (result_matrix, result_vector, result)
+    return result
 
 
 def purity_score(y_truth, y_hat):
@@ -181,7 +181,7 @@ def purity_score(y_truth, y_hat):
 
         vector_purity = np.sum(matrix_divided, axis=axis)
         scalar_purity = np.average(vector_purity, weights=count_per_row)
-        return (vector_purity, scalar_purity)
+        return scalar_purity
 
     count_matrix = compute_count_matrix(y_truth, y_hat)
 