From 3b960e0f1923a5a9427417aa75b5fb2eef90657c Mon Sep 17 00:00:00 2001
From: quillotm <mathias.quillot@univ-avignon.fr>
Date: Mon, 9 Aug 2021 11:12:54 +0200
Subject: [PATCH] Clustering command allows you to compute kmeans specifying k,
 kmin and kmax or a list of k-values.

---
 volia/clustering.py | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 volia/clustering.py

diff --git a/volia/clustering.py b/volia/clustering.py
new file mode 100644
index 0000000..7b2359f
--- /dev/null
+++ b/volia/clustering.py
@@ -0,0 +1,90 @@
+import argparse
+from os import path, mkdir
+from utils import SubCommandRunner
+from core.data import read_features, read_lst
+
+import numpy as np
+from sklearn.cluster import KMeans
+import pickle
+
+
+def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
+    """
+    Fit k-means on the listed features and pickle every fitted model.
+
+    @param features: features file, loaded with read_features
+    @param lst: list file, loaded with read_lst; its entries select the rows
+    @param k: number of clusters (kmin if kmax specified)
+    @param kmax: maximum k to compute; every k in [k, kmax] is fitted
+    @param klist: extra k values to compute (each converted with int())
+    @param output: output file if neither kmax nor klist is given, else the
+        output directory, filled with "clustering_<k>.pkl" files
+    @raise Exception: if output is an existing directory where a file is
+        expected, or an existing file where a directory is expected
+    """
+    from os import makedirs  # local: the file-level import only has path, mkdir
+
+    def fit_and_dump(n_clusters: int, target: str):
+        print(f"Computing clustering with k={n_clusters}")
+        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
+        # The context manager closes the pickle file even if dump() raises.
+        with open(target, "wb") as handle:
+            pickle.dump(model, handle)
+
+    # -- READ FILES --
+    features_dict = read_features(features)
+    lst_dict = read_lst(lst)
+    X = np.asarray([features_dict[x] for x in lst_dict])
+
+    # Exception cases
+    multi = kmax is not None or klist is not None
+    if not multi and path.isdir(output):
+        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
+    if multi and path.isfile(output):
+        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
+
+    # Mono value case
+    if not multi:
+        fit_and_dump(k, output)
+        return
+
+    # Multi values case: the kmax range first, then the klist values.
+    ks = []
+    if kmax is not None:
+        ks.extend(range(k, kmax + 1))
+    if klist is not None:
+        ks.extend(int(value) for value in klist)
+    # makedirs also creates missing parent directories of output.
+    makedirs(output, exist_ok=True)
+    for n_clusters in ks:
+        fit_and_dump(n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
+
+
+if __name__ == "__main__":
+    # Main parser
+    parser = argparse.ArgumentParser(description="Clustering methods to apply")
+    subparsers = parser.add_subparsers(title="action")
+
+    # kmeans
+    parser_kmeans = subparsers.add_parser(
+        "kmeans", help="Compute clustering using k-means algorithm")
+
+    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
+    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
+    parser_kmeans.add_argument("-k", default=2, type=int,
+                               help="number of clusters to compute. It is kmin if kmax is specified.")
+    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
+    parser_kmeans.add_argument("--klist", nargs="+", type=int,
+                               help="List of k values to test. As kmax, activate the multi values mod.")
+    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
+    # "which" names the chosen sub-command (presumably dropped via remove=).
+    parser_kmeans.set_defaults(which="kmeans")
+
+    # Parse; sub-commands are optional in Python 3 argparse, so without one
+    # "which" is never set and the lookup below would raise AttributeError.
+    args = parser.parse_args()
+    if not hasattr(args, "which"):
+        parser.error("an action is required (choose from: kmeans)")
+
+    # Run commands
+    SubCommandRunner({"kmeans": kmeans_run}).run(args.which, args.__dict__, remove="which")
\ No newline at end of file
-- 
1.8.2.3