Blame view
volia/clustering.py
9.32 KB
3b960e0f1 Clustering comman... |
1 2 3 |
import argparse from os import path, mkdir from utils import SubCommandRunner |
ef499b777 Now we can extrac... |
4 |
from core.data import read_features, read_lst, read_labels, write_line |
3b960e0f1 Clustering comman... |
5 6 7 |
import numpy as np from sklearn.cluster import KMeans import pickle |
9191399c3 Clustering and ev... |
8 |
from clustering_modules.kmeans import kmeans |
4152e83df Addind kmeans mah... |
9 |
from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis |
9191399c3 Clustering and ev... |
10 11 |
from sklearn.preprocessing import LabelEncoder |
fea9649a7 Add many measures... |
12 |
from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
9191399c3 Clustering and ev... |
13 14 |
import core.measures |
3e2abe83e Multiple output f... |
15 |
import json |
9191399c3 Clustering and ev... |
16 17 18 |
# Registry of the available clustering back-ends, keyed by the name used on
# the command line (--modeltype). The instances are created once at import
# time and shared by every sub-command.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
}

# Registry of the evaluation measures, keyed by the name used on the command
# line (--measure). Each value is called as ``score(Y_truth, Y_pred)``.
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}


def disequilibrium_run(**kwargs):
    """
    Placeholder for the "disequilibrium" sub-command: it does nothing yet.

    @param kwargs: ignored. NOTE(review): the sub-command runner is handed the
                   parsed command line options, presumably forwarding them as
                   keyword arguments; accepting them here keeps the stub
                   callable either way - confirm against SubCommandRunner.
    @return: None
    """
    pass


def measure_run(measure,
                features: str,
                lst: str,
                truelabels: str,
                model: str,
                modeltype: str):
    """
    Evaluate a saved clustering model against ground-truth labels and print
    the scores as a JSON object ``{measure name: score}`` on stdout.

    @param measure: names of the measures to compute (keys of
                    EVALUATION_METHODS). A single name given as a plain
                    string is accepted too.
    @param features: features file, given to read_features.
    @param lst: list file, given to read_lst; its keys select the samples
                and fix their order.
    @param truelabels: labels file, given to read_labels; the first element
                       of each entry is used as the ground-truth label.
    @param model: saved model, given to the clustering module's load().
    @param modeltype: key of CLUSTERING_METHODS selecting the module.
    @return: None
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    # Resolve the scoring functions first so that an unknown measure name
    # fails (KeyError) before any file is read.
    scorers = {name: EVALUATION_METHODS[name] for name in measure}

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    scores = {}
    if scorers:
        # Everything below is independent of the measure, so it is done once
        # instead of once per requested measure.
        feats_dict = read_features(features)
        labels_dict = read_labels(truelabels)
        lst_keys = list(read_lst(lst))

        feats = np.asarray([feats_dict[key] for key in lst_keys])
        Y_pred = module.predict(feats)

        # Encode the ground-truth labels as integers before scoring.
        raw_truth = [labels_dict[key][0] for key in lst_keys]
        Y_truth = LabelEncoder().fit_transform(raw_truth)

        for name, score in scorers.items():
            scores[name] = score(Y_truth, Y_pred)

    print(json.dumps(scores))
9191399c3 Clustering and ev... |
69 |
|
3b960e0f1 Clustering comman... |
70 |
|
ed89325d5 Now, we can give ... |
71 72 73 74 75 76 77 78 79 80 81 |
def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Fit one or several k-means models and print a JSON summary (the call
    arguments plus the list of saved models) on stdout.

    Three modes, selected by the arguments:
      - neither kmax nor klist: a single model with k clusters is saved to
        the file "output";
      - kmax: one model per value in [k, kmax] is saved in the directory
        "output" as clustering_<k>.pkl;
      - klist: one model per listed value is saved in the directory "output"
        as clustering_<k>.pkl (k itself is ignored for this pass).
    If both kmax and klist are given, both passes are executed.

    @param features: features file, given to read_features
    @param lst: list file, given to read_lst; selects the samples to cluster
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param maxiter: forwarded to the model's fit()
    @param ninit: forwarded to the model's fit()
    @param output: output file if neither kmax nor klist is specified, else,
                   output directory
    @param tol: forwarded to the model's fit()
    @param debug: print progress messages and forward the flag to fit()
    @param mahalanobis: distance option of k-means; when true the
                        "k-means-mahalanobis" module is used.
    @return: None
    @raise Exception: when "output" is an existing directory in single value
                      mode, or an existing file in multi values mode.
    """
    # Snapshot of the call arguments for the JSON report. This must remain
    # the first statement: locals() then only holds the parameters.
    json_content = locals().copy()
    json_content["models"] = []

    multi_values = kmax is not None or klist is not None

    def fit_model(n_clusters: int, output_file):
        # Fit one model, save it and record it in the JSON report.
        if debug:
            print(f"Computing clustering with k={n_clusters}")
        model = CLUSTERING_METHODS["k-means"]
        if mahalanobis:
            if debug:
                print("Mahalanobis activated")
            model = CLUSTERING_METHODS["k-means-mahalanobis"]
        model.fit(X, n_clusters, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": n_clusters,
        })

    def fit_into_directory(n_clusters: int):
        # Multi values mode: create the output directory on first use and
        # save the model under its conventional file name.
        if not path.isdir(output):
            mkdir(output)
        fit_model(n_clusters,
                  path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception('The "output" is an existing directory while the system is waiting the path of a file.')
    if multi_values and path.isfile(output):
        raise Exception('The "output" is an existing file while the system is waiting the path of a directory.')

    # Mono value case
    if not multi_values:
        fit_model(k, output)

    # Multi values case with kmax
    if kmax is not None:
        if not path.isdir(output):
            mkdir(output)
        for n_clusters in range(k, kmax + 1):
            fit_into_directory(n_clusters)

    # Second multi values case with klist
    if klist is not None:
        if not path.isdir(output):
            mkdir(output)
        for value in klist:
            # Values may arrive as strings from the command line.
            fit_into_directory(int(value))

    print(json.dumps(json_content))


def extract_run(features, lst, model, modeltype, outfile):
    """
    Predict the cluster of every sample of a list with a saved model, write
    one line per sample to "outfile" and print {"outfile": outfile} as JSON.

    @param features: features file, given to read_features
    @param lst: list file, given to read_lst; selects and orders the samples
    @param model: saved model, given to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the module
    @param outfile: path of the text file to write (overwritten)
    @return: None
    """
    feats_dict = read_features(features)
    lst_keys = list(read_lst(lst))
    feats = np.asarray([feats_dict[key] for key in lst_keys])

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)
    Y_pred = module.predict(feats)

    with open(outfile, "w") as f:
        # Predictions are in the same order as lst_keys.
        for i, key in enumerate(lst_keys):
            write_line(key, Y_pred[i], f)

    print(json.dumps({"outfile": outfile}))
ed89325d5 Now, we can give ... |
161 |
|
3b960e0f1 Clustering comman... |
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
if __name__ == "__main__":
    # Command line entry point: one sub-command per action. Each sub-parser
    # stores its own name in "which", which is then used to pick the function
    # to run; the remaining options are handed to the runner.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")
    parser_kmeans.add_argument("--features", required=True, type=str,
                               help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str,
                               help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int,
                               help="if specified, k is kmin.")
    # No type here: the values are converted to int by kmeans_run.
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter", type=int, default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit", type=int, default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol", type=float, default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output", default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")
    parser_measure.add_argument("--measure", required=True, nargs="+",
                                choices=list(EVALUATION_METHODS), help="...")
    for flag in ("--features", "--lst", "--truelabels", "--model"):
        parser_measure.add_argument(flag, required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype", required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")
    for flag in ("--features", "--lstrain", "--lstest", "--model"):
        parser_disequilibrium.add_argument(flag, required=True, type=str,
                                           help="...")
    parser_disequilibrium.add_argument("--model-type", required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Extract
    parser_extract = subparsers.add_parser(
        "extract", help="extract cluster labels")
    for flag in ("--features", "--lst", "--model"):
        parser_extract.add_argument(flag, required=True, type=str, help="...")
    parser_extract.add_argument("--modeltype", required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_extract.add_argument("--outfile", required=True, type=str,
                                help="...")
    parser_extract.set_defaults(which="extract")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional for argparse: without one, "which" is never
    # set. Report a usage error instead of failing on the attribute access.
    if not hasattr(args, "which"):
        parser.error("no action given; use --help to list the available actions")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run,
        "extract": extract_run,
    })

    runner.run(args.which, args.__dict__, remove="which")