From ed89325d5d02f6e7878e3fd52498c8ad1ca653be Mon Sep 17 00:00:00 2001 From: quillotm Date: Mon, 16 Aug 2021 15:57:59 +0200 Subject: [PATCH] Now, we can give more parameters to k-means command. Mahalanobis was tested and seems to work well. Need more tests. --- volia/clustering.py | 42 +++++++++++++++++---- volia/clustering_modules/kmeans.py | 4 +- volia/clustering_modules/kmeans_mahalanobis.py | 51 +++++++++++++++++--------- 3 files changed, 71 insertions(+), 26 deletions(-) diff --git a/volia/clustering.py b/volia/clustering.py index 9b92ff4..8406aed 100644 --- a/volia/clustering.py +++ b/volia/clustering.py @@ -67,7 +67,17 @@ def measure_run(measure: str, features: str, lst: str, truelabels: str, model: s print(json.dumps(eval)) -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False): +def kmeans_run(features: str, + lst: str, + k:int, + kmax: int, + klist, + maxiter: int, + ninit: int, + output: str, + tol: float, + debug: bool = False, + mahalanobis: str = False): """ @param features: output features @@ -92,12 +102,12 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma # Mono value case if kmax is None and klist is None: - print(f"Computing clustering with k={k}") + if debug: + print(f"Computing clustering with k={k}") model = CLUSTERING_METHODS["k-means"] if mahalanobis: - print("Computing with mahalanobis distance") model = CLUSTERING_METHODS["k-means-mahalanobis"] - model.fit(X, k) + model.fit(X, k, tol, maxiter, debug) model.save(output) # Multi values case with kmax @@ -109,7 +119,7 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma model = CLUSTERING_METHODS["k-means"] if mahalanobis: model = CLUSTERING_METHODS["k-means-mahalanobis"] - model.fit(X, i) + model.fit(X, i, tol, maxiter, debug) model.save(path.join(output, "clustering_" + str(i) + ".pkl")) # Second multi values case with klist @@ -120,11 +130,16 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, ma k = int(k) model = CLUSTERING_METHODS["k-means"] if mahalanobis: - print("Computing with mahalanobis distance") model = CLUSTERING_METHODS["k-means-mahalanobis"] - model.fit(X, k) + model.fit(X, k, tol, maxiter, debug) model.save(path.join(output, "clustering_" + str(k) + ".pkl")) + # TODO: Output json to explain the end parameters like number of iteration, tol reached and stoped the process ? + # etc. (what distance, what parameters etc) + # TODO: Move example data into a directory. + # TODO: Add example receipts + # TODO: n_init have to be taken into account for mahalanobis case of k-means algorithm. + if __name__ == "__main__": # Main parser @@ -142,6 +157,19 @@ if __name__ == "__main__": parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") parser_kmeans.add_argument("--klist", nargs="+", help="List of k values to test. As kmax, activate the multi values mod.") + parser_kmeans.add_argument("--maxiter", + type=int, + default=300, + help="Max number of iteration before stoping if not converging") + parser_kmeans.add_argument("--ninit", + type=int, + default=10, + help="Number of time the k-means algorithm will be run with different centroid seeds.") + parser_kmeans.add_argument("--tol", + type=float, + default=0.0001, + help="Tolerance to finish of distance between centroids and their updates.") + parser_kmeans.add_argument("--debug", action="store_true") parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") diff --git a/volia/clustering_modules/kmeans.py b/volia/clustering_modules/kmeans.py index 2e8a95a..23ad00a 100644 --- a/volia/clustering_modules/kmeans.py +++ b/volia/clustering_modules/kmeans.py @@ -33,11 +33,11 @@ class kmeans(): with open(model_path, "wb") as f: pickle.dump(self.kmeans_model, f) - def fit(self, features, k: int): + def fit(self, features, k: int, tol: float, maxiter: int=300, debug: bool=False): """ @param features: @param k: @return: """ - self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features) diff --git a/volia/clustering_modules/kmeans_mahalanobis.py b/volia/clustering_modules/kmeans_mahalanobis.py index 6848cdd..f09cb43 100644 --- a/volia/clustering_modules/kmeans_mahalanobis.py +++ b/volia/clustering_modules/kmeans_mahalanobis.py @@ -37,8 +37,8 @@ class kmeansMahalanobis(): @return: """ data = None - with open(model_path): - data = pickle.load() + with open(model_path, "rb") as f: + data = pickle.load(f) if data is None: raise Exception("Le modèle n'a pas pu être chargé") else: @@ -60,8 +60,8 @@ class kmeansMahalanobis(): with open(modelpath, "wb") as f: pickle.dump(data, f) - def fit(self, features, K: int): - self._train(features, K) + def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False): + self._train(features, k, tol, maxiter, debug) def _initialize_model(self, X, number_clusters): d = X.shape[1] @@ -96,11 +96,12 @@ class kmeansMahalanobis(): #plt.xlim(0, 1) plt.savefig("test_" + str(iteration) + ".pdf") - def _train(self, features, K: int): + def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False): X = features N = X.shape[0] d = X.shape[1] + X_embedded = None C, L = self._initialize_model(X, K) self.C = C self.L = L @@ -109,9 +110,9 @@ class kmeansMahalanobis(): end_algo = False i = 0 while not end_algo: - if i == 10: - exit(1) - print("Iteration: ", i) + if debug: + print("Iteration: ", i) + # Calcul matrix distance distances = np.zeros((N, K)) @@ -119,11 +120,14 @@ class kmeansMahalanobis(): for k in range(self.K): distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) closest_cluster = np.argmin(distances, axis=1) - if i % 1 == 0: - # -- Debug tool ---------------------- - # TSNE - #X_embedded = np.concatenate((X, self.C), axis=0) - X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) + + # -- Debug tool ---------------------- + if debug and i % 10 == 0: + # TSNE if needed + X_embedded = np.concatenate((X, self.C), axis=0) + if d > 2: + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) + # Then plot self._plot_iteration( i, @@ -131,9 +135,9 @@ class kmeansMahalanobis(): closest_cluster, X_embedded[X.shape[0]:] ) - # ------------------------------------ + # ------------------------------------ - end_algo = True + old_c = self.C.copy() for k in range(K): # Find subset of X with values closed to the centroid c_k. X_sub = np.where(closest_cluster == k) @@ -152,8 +156,21 @@ class kmeansMahalanobis(): K_new = K_new / X_sub.shape[0] K_new = np.linalg.pinv(K_new) - if end_algo and (not (self.C[k] == C_new).all()): # If the same stop - end_algo = False + #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop + # end_algo = False self.C[k] = C_new self.L[k] = K_new + + diff = np.sum(np.absolute((self.C - old_c) / old_c * 100)) + if diff > tol: + end_algo = False + if debug: + print(f"{diff}") + elif debug: + print(f"Tolerance threshold {tol} reached with diff {diff}") + end_algo = True i = i + 1 + if i > maxiter: + end_algo = True + if debug: + print(f"Iteration {maxiter} reached") -- 1.8.2.3