"""Clustering command.

Allows you to compute k-means by specifying a single k, a kmin/kmax range,
or an explicit list of k-values.
"""
# Provenance: content of volia/clustering.py as created by commit
# 3b960e0f1923a5a9427417aa75b5fb2eef90657c (quillotm, Mon, 9 Aug 2021).
import argparse
import pickle
from os import makedirs, mkdir, path
from typing import Iterable, List, Optional, Union

import numpy as np
from sklearn.cluster import KMeans

from core.data import read_features, read_lst
from utils import SubCommandRunner


def _fit_and_save(X, n_clusters: int, destination: str) -> None:
    """Fit a k-means model with ``n_clusters`` clusters on ``X`` and pickle it to ``destination``."""
    print(f"Computing clustering with k={n_clusters}")
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager guarantees the file is flushed and closed, even if
    # pickling fails half-way through.
    with open(destination, "wb") as model_file:
        pickle.dump(kmeans, model_file)


def kmeans_run(
    features: str,
    lst: str,
    k: int,
    kmax: Optional[int],
    klist: Optional[Iterable[Union[int, str]]],
    output: str,
) -> None:
    """
    Fit one or several k-means models and pickle each fitted model.

    Single-value mode (neither kmax nor klist given): one model with k
    clusters is written to the file ``output``.
    Multi-values mode (kmax and/or klist given): ``output`` is a directory
    (created if missing) receiving one ``clustering_<k>.pkl`` file per value.
    When both kmax and klist are given, the union of both sets of values is
    computed, each value once.

    @param features: features file, loaded with read_features
    @param lst: list file, loaded with read_lst; its entries are used as keys
                into the loaded features (presumably utterance ids - confirm
                against core.data)
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute (inclusive), or None
    @param klist: list of k values to compute (ints or int-like strings), or None
    @param output: output file if neither kmax nor klist is specified, else, output directory
    @raise IsADirectoryError: single-value mode and output is an existing directory
    @raise NotADirectoryError: multi-values mode and output is an existing file
    """
    multi_values = kmax is not None or klist is not None

    # Exception cases: checked first so that a bad output path is reported
    # before the (potentially large) feature files are loaded.
    if not multi_values and path.isdir(output):
        raise IsADirectoryError(
            "The \"output\" is an existing directory while the system is waiting the path of a file.")
    if multi_values and path.isfile(output):
        raise NotADirectoryError(
            "The \"output\" is an existing file while the system is waiting the path of a directory.")

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    # Mono value case
    if not multi_values:
        _fit_and_save(X, k, output)
        return

    # Multi values case: gather every requested k, range first, then the list.
    k_values: List[int] = []
    if kmax is not None:
        k_values.extend(range(k, kmax + 1))
    if klist is not None:
        k_values.extend(int(value) for value in klist)

    # exist_ok avoids the check-then-create race and also creates parents.
    makedirs(output, exist_ok=True)

    # A value requested by both kmax and klist maps to the same output file;
    # fit it only once.
    seen = set()
    for n_clusters in k_values:
        if n_clusters in seen:
            continue
        seen.add(n_clusters)
        _fit_and_save(X, n_clusters, path.join(output, f"clustering_{n_clusters}.pkl"))


if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+", type=int,
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.set_defaults(which="kmeans")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no "which" attribute is set; report a usage error
    # instead of crashing with an AttributeError.
    if getattr(args, "which", None) is None:
        parser.error("an action is required (e.g. kmeans)")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run
    })

    runner.run(args.which, vars(args), remove="which")