From ed89325d5d02f6e7878e3fd52498c8ad1ca653be Mon Sep 17 00:00:00 2001
From: quillotm <mathias.quillot@univ-avignon.fr>
Date: Mon, 16 Aug 2021 15:57:59 +0200
Subject: [PATCH] Now we can pass more parameters to the k-means command.
 Mahalanobis was tested and seems to work well. More tests are needed.

---
 volia/clustering.py                            | 42 +++++++++++++++++----
 volia/clustering_modules/kmeans.py             |  4 +-
 volia/clustering_modules/kmeans_mahalanobis.py | 51 +++++++++++++++++---------
 3 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/volia/clustering.py b/volia/clustering.py
index 9b92ff4..8406aed 100644
--- a/volia/clustering.py
+++ b/volia/clustering.py
@@ -67,7 +67,17 @@ def measure_run(measure: str, features: str, lst: str, truelabels: str, model: s
     print(json.dumps(eval))
 
 
-def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False):
+def kmeans_run(features: str,
+               lst: str,
+               k:int,
+               kmax: int,
+               klist,
+               maxiter: int,
+               ninit: int,
+               output: str,
+               tol: float,
+               debug: bool = False,
+               mahalanobis: str = False):
     """
 
     @param features: output features
@@ -92,12 +102,12 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma
 
     # Mono value case
     if kmax is None and klist is None:
-        print(f"Computing clustering with k={k}")
+        if debug:
+            print(f"Computing clustering with k={k}")
         model = CLUSTERING_METHODS["k-means"]
         if mahalanobis:
-            print("Computing with mahalanobis distance")
             model = CLUSTERING_METHODS["k-means-mahalanobis"]
-        model.fit(X, k)
+        model.fit(X, k, tol, maxiter, debug)
         model.save(output)
 
     # Multi values case with kmax
@@ -109,7 +119,7 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma
             model = CLUSTERING_METHODS["k-means"]
             if mahalanobis:
                 model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, i)
+            model.fit(X, i, tol, maxiter, debug)
             model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
 
     # Second multi values case with klist
@@ -120,11 +130,16 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma
             k = int(k)
             model = CLUSTERING_METHODS["k-means"]
             if mahalanobis:
-                print("Computing with mahalanobis distance")
                 model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, k)
+            model.fit(X, k, tol, maxiter, debug)
             model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
 
+    # TODO: Output a JSON summary of the final run parameters (number of iterations, whether tol was reached
+    # and stopped the process, which distance was used, etc.).
+    # TODO: Move example data into a directory.
+    # TODO: Add example recipes.
+    # TODO: n_init has to be taken into account for the Mahalanobis case of the k-means algorithm.
+
 
 if __name__ == "__main__":
     # Main parser
@@ -142,6 +157,19 @@ if __name__ == "__main__":
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
+    parser_kmeans.add_argument("--maxiter",
+                               type=int,
+                               default=300,
+                               help="Max number of iteration before stoping if not converging")
+    parser_kmeans.add_argument("--ninit",
+                               type=int,
+                               default=10,
+                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
+    parser_kmeans.add_argument("--tol",
+                               type=float,
+                               default=0.0001,
+                               help="Tolerance to finish of distance between centroids and their updates.")
+    parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="output file if only k. Output directory if multiple kmax specified.")
diff --git a/volia/clustering_modules/kmeans.py b/volia/clustering_modules/kmeans.py
index 2e8a95a..23ad00a 100644
--- a/volia/clustering_modules/kmeans.py
+++ b/volia/clustering_modules/kmeans.py
@@ -33,11 +33,11 @@ class kmeans():
         with open(model_path, "wb") as f:
             pickle.dump(self.kmeans_model, f)
 
-    def fit(self, features, k: int):
+    def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
         """
 
         @param features:
         @param k:
         @return:
         """
-        self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
+        self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features)
diff --git a/volia/clustering_modules/kmeans_mahalanobis.py b/volia/clustering_modules/kmeans_mahalanobis.py
index 6848cdd..f09cb43 100644
--- a/volia/clustering_modules/kmeans_mahalanobis.py
+++ b/volia/clustering_modules/kmeans_mahalanobis.py
@@ -37,8 +37,8 @@ class kmeansMahalanobis():
         @return:
         """
         data = None
-        with open(model_path):
-            data = pickle.load()
+        with open(model_path, "rb") as f:
+            data = pickle.load(f)
         if data is None:
             raise Exception("Le modèle n'a pas pu être chargé")
         else:
@@ -60,8 +60,8 @@ class kmeansMahalanobis():
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
 
-    def fit(self, features, K: int):
-        self._train(features, K)
+    def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
+        self._train(features, k, tol, maxiter, debug)
 
     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
@@ -96,11 +96,12 @@ class kmeansMahalanobis():
         #plt.xlim(0, 1)
         plt.savefig("test_" + str(iteration) + ".pdf")
 
-    def _train(self, features, K: int):
+    def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False):
         X = features
         N = X.shape[0]
         d = X.shape[1]
 
+        X_embedded = None
         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
@@ -109,9 +110,9 @@ class kmeansMahalanobis():
         end_algo = False
         i = 0
         while not end_algo:
-            if i == 10:
-                exit(1)
-            print("Iteration: ", i)
+            if debug:
+                print("Iteration: ", i)
+
             # Calcul matrix distance
             distances = np.zeros((N, K))
 
@@ -119,11 +120,14 @@ class kmeansMahalanobis():
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
             closest_cluster = np.argmin(distances, axis=1)
-            if i % 1 == 0:
-                # -- Debug tool ----------------------
-                # TSNE
-                #X_embedded = np.concatenate((X, self.C), axis=0)
-                X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+
+            # -- Debug tool ----------------------
+            if debug and i % 10 == 0:
+                # TSNE if needed
+                X_embedded = np.concatenate((X, self.C), axis=0)
+                if d > 2:
+                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+
                 # Then plot
                 self._plot_iteration(
                     i,
@@ -131,9 +135,9 @@ class kmeansMahalanobis():
                     closest_cluster,
                     X_embedded[X.shape[0]:]
                 )
-                # ------------------------------------
+            # ------------------------------------
 
-            end_algo = True
+            old_c = self.C.copy()
             for k in range(K):
                 # Find subset of X with values closed to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
@@ -152,8 +156,21 @@ class kmeansMahalanobis():
                 K_new = K_new / X_sub.shape[0]
                 K_new = np.linalg.pinv(K_new)
 
-                if end_algo and (not (self.C[k] == C_new).all()):  # If the same stop
-                    end_algo = False
+                # Convergence is decided once per iteration, after every centroid
+                # has been updated, instead of by exact equality per cluster.
                 self.C[k] = C_new
                 self.L[k] = K_new
+
+            # Summed relative centroid shift, in percent. NOTE(review): a zero
+            # coordinate in old_c makes this inf/nan -- confirm that is acceptable.
+            diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
+            # Stop on convergence whether or not debug output is enabled.
+            end_algo = not (diff > tol)
+            if debug:
+                print(f"Tolerance threshold {tol} reached with diff {diff}"
+                      if end_algo else f"{diff}")
             i = i + 1
+            if i >= maxiter:
+                end_algo = True
+                if debug:
+                    print(f"Iteration {maxiter} reached")
-- 
1.8.2.3