Blame view
volia/clustering.py
7.06 KB
3b960e0f1 Clustering comman... |
1 2 3 |
import argparse from os import path, mkdir from utils import SubCommandRunner |
9191399c3 Clustering and ev... |
4 |
from core.data import read_features, read_lst, read_labels |
3b960e0f1 Clustering comman... |
5 6 7 |
import numpy as np from sklearn.cluster import KMeans import pickle |
9191399c3 Clustering and ev... |
8 |
from clustering_modules.kmeans import kmeans |
4152e83df Addind kmeans mah... |
9 |
from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis |
9191399c3 Clustering and ev... |
10 11 |
from sklearn.preprocessing import LabelEncoder |
fea9649a7 Add many measures... |
12 |
from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
9191399c3 Clustering and ev... |
13 14 |
import core.measures |
3e2abe83e Multiple output f... |
15 |
import json |
9191399c3 Clustering and ev... |
16 17 18 |
# Clustering back-ends, keyed by the name accepted by ``--modeltype``.
# The instances are created once, at import time, and reused by every command.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis()
}

# Evaluation measures, keyed by the name accepted by ``--measure``.
# Each callable is invoked below as ``score(true_labels, predicted_labels)``.
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}


def disequilibrium_run():
    """
    Placeholder for the "disequilibrium" sub-command: currently does nothing.
    """
    pass


def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a saved clustering model and print the scores as a JSON object.

    The model is loaded, used to predict a cluster for every key found in the
    list file, and each requested measure is computed between the encoded
    true labels and the predictions. The result, a mapping
    ``{measure name: score}``, is printed on stdout with ``json.dumps``.

    @param measure: one measure name, or a sequence of measure names; each
                    must be a key of EVALUATION_METHODS
    @param features: path given to ``read_features``
    @param lst: path given to ``read_lst``; its keys select and order the samples
    @param truelabels: path given to ``read_labels``; the first entry of each
                       record is used as the true label
    @param model: path of the saved model, passed to the module's ``load``
    @param modeltype: key of CLUSTERING_METHODS selecting the model implementation
    @return: None
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Resolve the measures first so an unknown name fails before any file I/O.
    evaluations = {ms: EVALUATION_METHODS[ms] for ms in measure}

    # Inputs and predictions do not depend on the measure: compute them once.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Encode the true labels as integers before scoring.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    scores = {ms: evaluation(Y_truth, Y_pred) for ms, evaluation in evaluations.items()}

    print(json.dumps(scores))
9191399c3 Clustering and ev... |
68 |
|
3b960e0f1 Clustering comman... |
69 |
|
4152e83df Addind kmeans mah... |
70 |
def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False):
    """
    Fit one or several k-means models and save them.

    Three modes, selected by the arguments:
      - neither kmax nor klist: a single model with ``k`` clusters is saved
        to the file ``output``;
      - kmax given: one model per value in ``k..kmax`` (inclusive) is saved
        as ``clustering_<k>.pkl`` inside the directory ``output``;
      - klist given: one model per listed value is saved the same way.
    When both kmax and klist are given, both series are computed.

    @param features: output features
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param output: output file if kmax not specified, else, output directory
    @param mahalanobis: distance option of k-means.
    @raise Exception: if ``output`` is an existing directory in single-value
                      mode, or an existing file in a multi-value mode
    """
    # Function-scope import: the module itself only imports ``path`` and ``mkdir``.
    from os import makedirs

    multi_values = kmax is not None or klist is not None

    # Exception cases (checked before loading the data, to fail early)
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    # The model implementation depends only on the distance option.
    model = CLUSTERING_METHODS["k-means-mahalanobis" if mahalanobis else "k-means"]

    # Mono value case
    if not multi_values:
        print(f"Computing clustering with k={k}")
        if mahalanobis:
            print("Computing with mahalanobis distance")
        model.fit(X, k)
        model.save(output)
        return

    # Multi values cases write into a directory; create it (and any missing
    # parent) if needed.
    makedirs(output, exist_ok=True)

    # Multi values case with kmax
    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            model.fit(X, n_clusters)
            model.save(path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    # Second multi values case with klist
    if klist is not None:
        for value in klist:
            # Values come from the command line as strings.
            n_clusters = int(value)
            if mahalanobis:
                print("Computing with mahalanobis distance")
            model.fit(X, n_clusters)
            model.save(path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
3b960e0f1 Clustering comman... |
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
if __name__ == "__main__":
    # Command-line entry point: one sub-command per action. Each sub-parser
    # records its own name in ``which`` so the matching function can be run.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str,
                               help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str,
                               help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int,
                               help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure", required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str,
                                help="...")
    parser_measure.add_argument("--lst", required=True, type=str,
                                help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str,
                                help="...")
    parser_measure.add_argument("--model", required=True, type=str,
                                help="...")
    parser_measure.add_argument("--modeltype", required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    # NOTE(review): disequilibrium_run() takes no parameters while this
    # sub-parser defines five options; presumably the runner forwards the
    # parsed options as keyword arguments -- confirm before relying on it.
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str,
                                       help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str,
                                       help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str,
                                       help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str,
                                       help="...")
    parser_disequilibrium.add_argument("--model-type", required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # ``which`` is only set by a sub-parser. Without an action it is absent,
    # so report a usage error instead of failing with AttributeError below.
    if getattr(args, "which", None) is None:
        parser.error("an action is required: kmeans, measure or disequilibrium")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    runner.run(args.which, vars(args), remove="which")