Commit 2edadabee352868100d1829adb0c1dfb9eb6b4ad
1 parent
55bcf758f3
Exists in
master
By default, k-means with the Mahalanobis distance now uses the constrained variant
Showing 1 changed file with 1 additions and 1 deletions Inline Diff
volia/clustering.py
| 1 | import argparse | 1 | import argparse |
| 2 | from os import path, mkdir | 2 | from os import path, mkdir |
| 3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
| 4 | from core.data import read_features, read_lst, read_labels, write_line | 4 | from core.data import read_features, read_lst, read_labels, write_line |
| 5 | import numpy as np | 5 | import numpy as np |
| 6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
| 7 | import pickle | 7 | import pickle |
| 8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
| 9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | 9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis |
| 10 | 10 | ||
| 11 | from sklearn.preprocessing import LabelEncoder | 11 | from sklearn.preprocessing import LabelEncoder |
| 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
| 13 | 13 | ||
| 14 | import core.measures | 14 | import core.measures |
| 15 | import json | 15 | import json |
| 16 | 16 | ||
| 17 | 17 | ||
# Registry of the available clustering back-ends, keyed by the names offered as
# choices of the "--modeltype" command-line options.
# NOTE: the values are instances created once when this module is loaded, so every
# lookup of a given key returns the same object.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
}
| 23 | 23 | ||
# Registry of the evaluation measures, keyed by the names offered as choices of the
# "--measure" command-line option. Each value is called as score(Y_truth, Y_pred).
# Two measures come from core.measures, the other three from sklearn.metrics.
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}
| 31 | 31 | ||
| 32 | 32 | ||
def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder handler of the "disequilibrium" sub-command: it does nothing yet.

    The parameters mirror the options declared on the "disequilibrium" sub-parser
    (--features, --lstrain, --lstest, --model, --model-type) so that the handler can
    receive the parsed arguments as keyword arguments instead of failing on an
    unexpected argument.
    NOTE(review): this assumes SubCommandRunner forwards the parsed arguments as
    keyword arguments - confirm against utils.SubCommandRunner.
    All parameters are optional, so calling the function without arguments still works.

    @param features: value of --features (unused for now)
    @param lstrain: value of --lstrain (unused for now)
    @param lstest: value of --lstest (unused for now)
    @param model: value of --model (unused for now)
    @param model_type: value of --model-type (unused for now)
    @return: None
    """
    return None
| 35 | 35 | ||
| 36 | 36 | ||
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model against reference labels and print the result as JSON.

    @param measure: name(s) of the measures to compute, each a key of EVALUATION_METHODS.
                    The command line supplies a list (the option uses nargs="+");
                    a single name given as a plain string is accepted as well.
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the evaluated entries
    @param truelabels: reference labels file, read with read_labels; the first element of
                       each entry is used as the true label
    @param model: saved model, passed to the load() method of the clustering module
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @return: None; a JSON object mapping each measure name to its score is printed
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # A bare string would otherwise be iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    # None of the inputs, predictions or encoded labels depend on the measure,
    # so they are computed once instead of once per measure.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Encode the reference labels as integers before handing them to the score functions.
    Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])

    scores = {ms: EVALUATION_METHODS[ms](Y_truth, Y_pred) for ms in measure}
    print(json.dumps(scores))
| 69 | 69 | ||
| 70 | 70 | ||
def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Fit one or several k-means models, save them and print a JSON summary.

    Three modes exist: a single model for k (neither kmax nor klist given), one model
    per value in [k, kmax] when kmax is given, and one model per value of klist when
    klist is given. If both kmax and klist are given, both series are computed.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the entries to cluster
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param maxiter: maximum number of iterations, forwarded to the model's fit()
    @param ninit: number of initialisations, forwarded to the model's fit()
    @param output: output file if neither kmax nor klist is specified, else output directory
    @param tol: tolerance, forwarded to the model's fit()
    @param debug: print progress messages and forward the flag to fit()
    @param mahalanobis: use the constrained Mahalanobis k-means instead of plain k-means
    @return: None; the call parameters and the list of saved models are printed as JSON
    @raise Exception: if "output" is an existing directory in single-model mode, or an
                      existing file in a multi-value mode
    """
    # Explicit record of the call parameters. It holds the same keys as a locals()
    # snapshot taken on entry, but the key order is fixed.
    json_content = {
        "features": features,
        "lst": lst,
        "k": k,
        "kmax": kmax,
        "klist": klist,
        "maxiter": maxiter,
        "ninit": ninit,
        "output": output,
        "tol": tol,
        "debug": debug,
        "mahalanobis": mahalanobis,
        "models": [],
    }

    # The clustering module only depends on the flag, not on k: select it once.
    method = "k-means-mahalanobis-constrained" if mahalanobis else "k-means"
    model = CLUSTERING_METHODS[method]

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    def fit_model(n_clusters: int, output_file):
        """Fit the selected model with n_clusters clusters, save it and record it."""
        if debug:
            print(f"Computing clustering with k={n_clusters}")
            if mahalanobis:
                print("Mahalanobis activated")
        model.fit(X, n_clusters, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": n_clusters,
        })

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    if not multi_values:
        # Mono value case
        fit_model(k, output)
    else:
        # Both multi values cases write into the same directory: create it once.
        if not path.isdir(output):
            mkdir(output)

        # Multi values case with kmax
        if kmax is not None:
            for n_clusters in range(k, kmax + 1):
                fit_model(n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

        # Second multi values case with klist
        if klist is not None:
            for value in klist:
                # The command line delivers the values as strings.
                n_clusters = int(value)
                fit_model(n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    print(json.dumps(json_content))
| 144 | 144 | ||
| 145 | 145 | ||
def extract_run(features, lst, model, modeltype, outfile):
    """
    Predict a cluster label for every listed entry and write the labels to a file.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the entries
    @param model: saved model, passed to the load() method of the clustering module
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @param outfile: file to write; one write_line(key, label, file) call per entry
    @return: None; a JSON object holding the output file name is printed
    """
    all_features = read_features(features)
    keys = list(read_lst(lst))
    samples = np.asarray([all_features[key] for key in keys])

    clusterer = CLUSTERING_METHODS[modeltype]
    clusterer.load(model)
    predictions = clusterer.predict(samples)

    with open(outfile, "w") as handle:
        for position, key in enumerate(keys):
            write_line(key, predictions[position], handle)

    print(json.dumps({"outfile": outfile}))
| 162 | 162 | ||
| 163 | 163 | ||
# Command-line entry point: one sub-command per action, each dispatched to its *_run handler.
if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    # The values stay strings here; kmeans_run converts them with int().
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    # The help text names the whole family of measures, not only the entropy.
    parser_measure = subparsers.add_parser(
        "measure", help="compute clustering evaluation measures")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    for option in ("--features", "--lst", "--truelabels", "--model"):
        parser_measure.add_argument(option, required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    for option in ("--features", "--lstrain", "--lstest", "--model"):
        parser_disequilibrium.add_argument(option, required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Extract
    parser_extract = subparsers.add_parser(
        "extract", help="extract cluster labels")

    for option in ("--features", "--lst", "--model"):
        parser_extract.add_argument(option, required=True, type=str, help="...")
    parser_extract.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_extract.add_argument("--outfile", required=True, type=str, help="...")
    parser_extract.set_defaults(which="extract")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no "which" default is set; report a usage error
    # instead of failing later with an AttributeError.
    if getattr(args, "which", None) is None:
        parser.error("an action is required (kmeans, measure, disequilibrium or extract)")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run,
        "extract": extract_run
    })

    runner.run(args.which, args.__dict__, remove="which")
| 258 | 258 |