Quillot Mathias / volia

Browse Code »

Commit 660d9960f95ade5bb2446df6177425097c9b71a9

Authored by quillotm 2021-08-16 23:43:16 +0200

1 parent 78e6974959

Exists in master

Adding n init parameters

Showing 2 changed files with 53 additions and 28 deletions Inline Diff

volia/clustering.py
volia/clustering_modules/kmeans_mahalanobis.py

volia/clustering.py

Diff comments View file @ 660d996

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import  kmeansMahalanobis
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
 import core.measures
 import json
 # Registry of the available clustering implementations, keyed by the names
 # offered as choices of the --modeltype command-line option.
 # The values are instances created once, when this module is imported, so
 # every lookup returns the same shared object.
 CLUSTERING_METHODS = {
     "k-means": kmeans(),
     "k-means-mahalanobis": kmeansMahalanobis()
 }
 # Registry of the scoring functions, keyed by the names offered as choices of
 # the --measure command-line option.
 # measure_run calls each of them as f(Y_truth, Y_pred).
 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
 def disequilibrium_run():
     """
     Placeholder for the "disequilibrium" action: it does nothing yet.

     @return: None
     """
     return None
 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
     Score a saved clustering model against reference labels and print the
     result on stdout as a JSON object mapping each measure name to its score.

     @param measure: names of the measures to compute (keys of EVALUATION_METHODS).
                     The value is iterated over, so a collection of names is
                     expected even though the annotation says str.
     @param features: path given to read_features
     @param lst: path given to read_lst; its keys select and order the evaluated items
     @param truelabels: path given to read_labels; the first label of each key is used
     @param model: path of the saved model, handed to the clustering module's load()
     @param modeltype: key of CLUSTERING_METHODS selecting the model implementation
     @return: None
     @raise KeyError: if modeltype or one of the measure names is not registered
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     # Resolve the scoring functions first so an unknown measure name fails
     # before any data file is read.
     scorers = {ms: EVALUATION_METHODS[ms] for ms in measure}

     # Neither the data nor the predictions depend on the measure: load and
     # predict once instead of once per requested measure.
     feats_dict = read_features(features)
     labels_dict = read_labels(truelabels)
     lst_keys = list(read_lst(lst))
     feats = np.asarray([feats_dict[key] for key in lst_keys])
     Y_pred = module.predict(feats)

     # The reference labels are encoded as integers before scoring.
     le = LabelEncoder()
     Y_truth = [labels_dict[key][0] for key in lst_keys]
     le.fit(Y_truth)
     Y_truth = le.transform(Y_truth)

     scores = {ms: scorer(Y_truth, Y_pred) for ms, scorer in scorers.items()}
     print(json.dumps(scores))
 def kmeans_run(features: str,
                lst: str,
                k: int,
                kmax: int,
                klist,
                maxiter: int,
                ninit: int,
                output: str,
                tol: float,
                debug: bool = False,
                mahalanobis: bool = False):
     """
     Fit one or several k-means models, save each of them, then print a JSON
     report holding the call parameters and the list of saved models.

     Three modes are available:
     - neither kmax nor klist: a single model with k clusters is saved to the file "output";
     - kmax given: one model per value of range(k, kmax + 1) is saved into the directory "output";
     - klist given: one model per value of klist is saved into the directory "output".
     When both kmax and klist are given, both series are computed.
     In the directory modes the files are named "clustering_<k>.pkl".

     @param features: features file, read with read_features
     @param lst: list file, read with read_lst; its keys select the feature vectors
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute
     @param klist: list of k values to compute (each one is converted with int())
     @param maxiter: forwarded to the model's fit()
     @param ninit: forwarded to the model's fit()
     @param output: output file if neither kmax nor klist is specified, else output directory
     @param tol: forwarded to the model's fit()
     @param debug: print progress messages and forward the flag to fit()
     @param mahalanobis: use the "k-means-mahalanobis" model instead of "k-means"
     @raise Exception: if "output" is an existing directory in single-model mode,
                       or an existing file in a multi-model mode
     """
     # Snapshot of the call parameters for the final report. This has to stay
     # the first statement so that no other local leaks into the report.
     json_content = locals().copy()
     json_content["models"] = []

     def fit_model(n_clusters: int, output_file: str):
         """Fit the selected model, save it to output_file and add it to the report."""
         if debug:
             print(f"Computing clustering with k={n_clusters}")
         model = CLUSTERING_METHODS["k-means"]
         if mahalanobis:
             if debug:
                 print("Mahalanobis activated")
             model = CLUSTERING_METHODS["k-means-mahalanobis"]
         model.fit(X, n_clusters, tol, ninit, maxiter, debug)
         model.save(output_file)
         json_content["models"].append({
             "model_file": output_file,
             "k": n_clusters,
         })

     def fit_model_in_directory(n_clusters: int):
         """Fit a model and save it under "output", named after its own number of clusters."""
         fit_model(n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])

     # Exception cases
     multi_values = kmax is not None or klist is not None
     if not multi_values and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
     if multi_values and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

     if not multi_values:
         # Mono value case
         fit_model(k, output)
     else:
         if not path.isdir(output):
             mkdir(output)
         # Multi values case with kmax
         if kmax is not None:
             for n_clusters in range(k, kmax + 1):
                 fit_model_in_directory(n_clusters)
         # Second multi values case with klist
         if klist is not None:
             for value in klist:
                 # The file name is derived from the value being fitted, so each
                 # klist entry gets its own file.
                 fit_model_in_directory(int(value))

     # Emit real JSON rather than the repr of the dict.
     print(json.dumps(json_content))
     # TODO: compute loss with k-means mahalanobis.
     # TODO: n_init have to be taken into account for mahalanobis case of k-means algorithm.
 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")
     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")
     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Max number of iteration before stoping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of time the k-means algorithm will be run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Tolerance to finish of distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="output file if only k. Output directory if multiple kmax specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")
     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute the entropy")
     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")
     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")
     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                 required=True,

volia/clustering_modules/kmeans_mahalanobis.py

Diff comments View file @ 660d996

 from sklearn.cluster import KMeans
 import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from abstract_clustering import AbstractClustering
 class kmeansMahalanobis():
     def __init__(self):
         """
         """
         self.C = None
         self.L = None
         self.K = None
     def predict(self, features):
         """
         @param features:
         @return:
         """
         N = features.shape[0]
         distances = np.zeros((N, self.K))
         for n in range(N):
             for k in range(self.K):
                 distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
         closest_cluster = np.argmin(distances, axis=1)
         return closest_cluster
     def load(self, model_path):
         """
         @param model_path:
         @return:
         """
         data = None
         with open(model_path, "rb") as f:
             data = pickle.load(f)
         if data is None:
             raise Exception("Le modèle n'a pas pu être chargé")
         else:
             self.C = data["C"]
             self.L = data["L"]
             self.K = data["K"]
     def save(self, modelpath: str):
         """
         @param modelpath:
         @return:
         """
         data = {
             "C": self.C,
             "L": self.L,
             "K": self.K
         }
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
-    def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
+    def fit(self, features, k: int, tol: float, ninit: int, maxiter: int=300, debug: bool=False):
-        self._train(features, k, tol, maxiter, debug)
+        results = []
+        for i in range(ninit):
+            results.append(self._train(features, k, tol, maxiter, debug))
+        losses = [v["loss"] for v in results]
+        best = results[losses.index(min(losses))]
+        if debug:
+            print(f"best: {best['loss']} loss")
+        self.C = best["C"]
+        self.L = best["L"]
+        self.K = best["K"]
     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
         C = X[np.random.choice(X.shape[0], number_clusters)]
         L = np.zeros((number_clusters, d, d))
         for k in range(number_clusters):
             L[k] = np.identity(d)
         return C, L
     def _dist(self, a, b, l):
         '''
         Distance euclidienne
         '''
         a = np.reshape(a, (-1, 1))
         b = np.reshape(b, (-1, 1))
         result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
         return result
     def _plot_iteration(self, iteration, points, clusters, centers):
         """
         Save a 2-D scatter plot of the current clustering state to the file
         "test_<iteration>.pdf" in the current working directory.

         @param iteration: iteration number, used to build the file name
         @param points: 2-D array; its first two columns are plotted as x and y
         @param clusters: one value per point, used to colour the points
         @param centers: 2-D array; its first two columns are drawn as red '+' markers
         @return: None
         """
         fig = plt.figure()
         try:
             ax = fig.add_subplot(111)
             scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
             ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
             ax.set_xlabel('x')
             ax.set_ylabel('y')
             plt.colorbar(scatter)
             plt.savefig("test_" + str(iteration) + ".pdf")
         finally:
             # pyplot keeps every figure open until it is closed explicitly.
             # This method is called repeatedly during training, so release the
             # figure once the file is written, or when plotting fails.
             plt.close(fig)
     def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False):
         X = features
         N = X.shape[0]
         d = X.shape[1]
-        X_embedded = None
         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
         self.K = K
         end_algo = False
         i = 0
         while not end_algo:
             if debug:
                 print("Iteration: ", i)
             # Calcul matrix distance
-            distances = np.zeros((N, K))
+            distances = np.zeros((N, self.K))
             for n in range(N):
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
             closest_cluster = np.argmin(distances, axis=1)
+            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
+            if debug:
+                print(f"loss {loss}")
             # -- Debug tool ----------------------
             if debug and i % 10 == 0:
                 # TSNE if needed
                 X_embedded = np.concatenate((X, self.C), axis=0)
                 if d > 2:
                     X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
                 # Then plot
                 self._plot_iteration(
                     i,
                     X_embedded[:X.shape[0]],
                     closest_cluster,
                     X_embedded[X.shape[0]:]
                 )
             # ------------------------------------
             old_c = self.C.copy()
             for k in range(K):
                 # Find subset of X with values closed to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
                 X_sub = np.take(X, X_sub[0], axis=0)
                 if X_sub.shape[0] == 0:
                     continue
                 np.mean(X_sub, axis=0)
                 C_new = np.mean(X_sub, axis=0)
                 # -- COMPUTE NEW LAMBDA (here named K) --
                 K_new = np.zeros((L.shape[1], L.shape[2]))
                 for x in X_sub:
                     x = np.reshape(x, (-1, 1))
                     c_tmp = np.reshape(C_new, (-1, 1))
                     K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
                 K_new = K_new / X_sub.shape[0]
                 K_new = np.linalg.pinv(K_new)
                 #if end_algo and (not (self.C[k] == C_new).all()):  # If the same stop
                 #    end_algo = False
                 self.C[k] = C_new
                 self.L[k] = K_new
             diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
             if diff > tol:
                 end_algo = False
                 if debug:
                     print(f"{diff}")
-            elif debug:
+            else:
-                print(f"Tolerance threshold {tol} reached with diff {diff}")
+                if debug:
+                    print(f"Tolerance threshold {tol} reached with diff {diff}")
                 end_algo = True
             i = i + 1
             if i > maxiter:
                 end_algo = True
                 if debug:
                     print(f"Iteration {maxiter} reached")
+        return {
+            "loss": loss,
+            "C": self.C,
+            "K": self.K,
+            "L": self.L