From 4152e83df25ef19c8b048592e9629911bcf77e1a Mon Sep 17 00:00:00 2001
From: quillotm
Date: Fri, 13 Aug 2021 17:03:44 +0200
Subject: [PATCH] Adding k-means Mahalanobis. The algorithm is now fully
 functional. Some problems remain with identity matrix usage and with
 infinite or NaN values.

---
 volia/clustering.py                            |  41 ++++---
 volia/clustering_modules/kmeans.py             |  33 ++++-
 volia/clustering_modules/kmeans_mahalanobis.py | 159 +++++++++++++++++++++++++
 3 files changed, 216 insertions(+), 17 deletions(-)
 create mode 100644 volia/clustering_modules/kmeans_mahalanobis.py

diff --git a/volia/clustering.py b/volia/clustering.py
index 9518fa8..9b92ff4 100644
--- a/volia/clustering.py
+++ b/volia/clustering.py
@@ -6,6 +6,7 @@ import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
+from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
 
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
@@ -15,7 +16,8 @@ import json
 
 CLUSTERING_METHODS = {
-    "k-means": kmeans()
+    "k-means": kmeans(),
+    "k-means-mahalanobis": kmeansMahalanobis()
 }
 
 EVALUATION_METHODS = {
@@ -65,8 +67,7 @@ def measure_run(measure: str, features: str, lst: str, truelabels: str, model: s
     print(json.dumps(eval))
 
-
-def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
+def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False):
     """
 
     @param features: output features
     @param lst: list file
     @param k: k
     @param kmax: maximum k to compute
     @param klist: list of k values to compute, ignore k value
     @param output: output file if kmax not specified, else, output directory
+    @param mahalanobis: if True, use the Mahalanobis-distance k-means variant.
""" # -- READ FILES -- features_dict = read_features(features) @@ -91,9 +93,12 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): # Mono value case if kmax is None and klist is None: print(f"Computing clustering with k={k}") - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) - preds = kmeans.predict(X) - pickle.dump(kmeans, open(output, "wb")) + model = CLUSTERING_METHODS["k-means"] + if mahalanobis: + print("Computing with mahalanobis distance") + model = CLUSTERING_METHODS["k-means-mahalanobis"] + model.fit(X, k) + model.save(output) # Multi values case with kmax if kmax is not None: @@ -101,10 +106,11 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): mkdir(output) Ks = range(k, kmax + 1) for i in Ks: - print(f"Computing clustering with k={i}") - kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) - preds = kmeans.predict(X) - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) + model = CLUSTERING_METHODS["k-means"] + if mahalanobis: + model = CLUSTERING_METHODS["k-means-mahalanobis"] + model.fit(X, i) + model.save(path.join(output, "clustering_" + str(i) + ".pkl")) # Second multi values case with klist if klist is not None: @@ -112,10 +118,12 @@ def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): mkdir(output) for k in klist: k = int(k) - print(f"Computing clustering with k={k}") - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) - preds = kmeans.predict(X) - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) + model = CLUSTERING_METHODS["k-means"] + if mahalanobis: + print("Computing with mahalanobis distance") + model = CLUSTERING_METHODS["k-means-mahalanobis"] + model.fit(X, k) + model.save(path.join(output, "clustering_" + str(k) + ".pkl")) if __name__ == "__main__": @@ -134,7 +142,10 @@ if __name__ == "__main__": parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") parser_kmeans.add_argument("--klist", nargs="+", help="List of k values to test. As kmax, activate the multi values mod.") - parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") + parser_kmeans.add_argument("--output", + default=".kmeans", + help="output file if only k. 
+                               help="output file when only k is given; output directory when kmax or klist is used.")
+    parser_kmeans.add_argument("--mahalanobis", action="store_true",
+                               help="use the Mahalanobis-distance variant of k-means")
     parser_kmeans.set_defaults(which="kmeans")
 
     # measure
diff --git a/volia/clustering_modules/kmeans.py b/volia/clustering_modules/kmeans.py
index 2e58f70..2e8a95a 100644
--- a/volia/clustering_modules/kmeans.py
+++ b/volia/clustering_modules/kmeans.py
@@ -8,7 +8,36 @@ class kmeans():
         self.kmeans_model = None
 
     def predict(self, features):
+        """
+        Predict the closest cluster for each sample.
+
+        @param features: feature matrix of shape (n_samples, n_features)
+        @return: array of predicted cluster indices
+        """
         return self.kmeans_model.predict(features)
 
-    def load(self, model_path):
-        self.kmeans_model = pickle.load(open(model_path, "rb"))
+    def load(self, model_path: str):
+        """
+        Load a pickled KMeans model from disk.
+
+        @param model_path: path to the pickled model file
+        @return:
+        """
+        with open(model_path, "rb") as f:
+            self.kmeans_model = pickle.load(f)
+
+    def save(self, model_path: str):
+        """
+        Pickle the fitted KMeans model to disk.
+
+        @param model_path: destination path for the pickled model
+        @return:
+        """
+        with open(model_path, "wb") as f:
+            pickle.dump(self.kmeans_model, f)
+
+    def fit(self, features, k: int):
+        """
+        Fit a scikit-learn KMeans model on the given features.
+
+        @param features: feature matrix of shape (n_samples, n_features)
+        @param k: number of clusters
+        @return:
+        """
+        self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
diff --git a/volia/clustering_modules/kmeans_mahalanobis.py b/volia/clustering_modules/kmeans_mahalanobis.py
new file mode 100644
index 0000000..0779cf8
--- /dev/null
+++ b/volia/clustering_modules/kmeans_mahalanobis.py
@@ -0,0 +1,159 @@
+
+
+from sklearn.cluster import KMeans
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+from abstract_clustering import AbstractClustering
+
+class kmeansMahalanobis():
+    def __init__(self):
+        """
+        k-means variant where each cluster k has a centroid C[k] and its own
+        precision matrix L[k], used as a Mahalanobis metric.
+        """
+        self.C = None
+        self.L = None
+        self.K = None
+
+    def predict(self, features):
+        """
+        Assign each sample to the cluster with the smallest Mahalanobis distance.
+
+        @param features: feature matrix of shape (n_samples, n_features)
+        @return: array of predicted cluster indices
+        """
+        N = features.shape[0]
+        distances = np.zeros((N, self.K))
+        for n in range(N):
+            for k in range(self.K):
+                distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
+        closest_cluster = np.argmin(distances, axis=1)
+        return closest_cluster
+
+    def load(self, model_path):
+        """
+        Load a model previously written by save().
+
+        @param model_path: path to the pickled model file
+        @return:
+        """
+        data = None
+        with open(model_path, "rb") as f:
+            data = pickle.load(f)
+        if data is None:
+            raise Exception("The model could not be loaded")
+        self.C = data["C"]
+        self.L = data["L"]
+        self.K = data["K"]
+
+    def save(self, modelpath: str):
+        """
+        Pickle the centroids, precision matrices and cluster count.
+
+        @param modelpath: destination path for the pickled model
+        @return:
+        """
+        data = {
+            "C": self.C,
+            "L": self.L,
+            "K": self.K
+        }
+        with open(modelpath, "wb") as f:
+            pickle.dump(data, f)
+
+    def fit(self, features, K: int):
+        """
+        Train the model on the given features.
+
+        @param features: feature matrix of shape (n_samples, n_features)
+        @param K: number of clusters
+        @return:
+        """
+        self._train(features, K)
+
+    def _initialize_model(self, X, number_clusters):
+        d = X.shape[1]
+        # Pick distinct samples as initial centroids and start from identity
+        # precision matrices (i.e. plain Euclidean distance).
+        C = X[np.random.choice(X.shape[0], number_clusters, replace=False)]
+        L = np.zeros((number_clusters, d, d))
+        for k in range(number_clusters):
+            L[k] = np.identity(d)
+        return C, L
+
+    def _dist(self, a, b, l):
+        '''
+        Squared Mahalanobis distance between a and b under the precision matrix l.
+        '''
+        a = np.reshape(a, (-1, 1))
+        b = np.reshape(b, (-1, 1))
+        result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
+        return result
+
+    def _plot_iteration(self, iteration, points, clusters, centers):
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
+        ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
+        ax.set_xlabel('x')
+        ax.set_ylabel('y')
+        plt.colorbar(scatter)
+        plt.savefig("test_" + str(iteration) + ".pdf")
+
+    def _train(self, features, K: int):
+        X = features
+        N = X.shape[0]
+        d = X.shape[1]
+
+        C, L = self._initialize_model(X, K)
+        self.C = C
+        self.L = L
+        self.K = K
+
+        end_algo = False
+        i = 0
+        while not end_algo:
+            if i == 10:
+                # Hard iteration cap while convergence issues remain.
+                break
+            print("Iteration: ", i)
+            # Compute the distance matrix between samples and centroids.
+            distances = np.zeros((N, K))
+            for n in range(N):
+                for k in range(self.K):
+                    distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
+            closest_cluster = np.argmin(distances, axis=1)
+
+            if i % 1 == 0:
+                # -- Debug tool: t-SNE projection of samples and centroids,
+                # plotted every iteration (raise the modulus to plot less often).
+                X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                self._plot_iteration(
+                    i,
+                    X_embedded[:X.shape[0]],
+                    closest_cluster,
+                    X_embedded[X.shape[0]:]
+                )
+                # ------------------------------------
+
+            end_algo = True
+            for k in range(K):
+                # Find the subset of X closest to the centroid c_k.
+                X_sub = np.where(closest_cluster == k)
+                X_sub = np.take(X, X_sub[0], axis=0)
+                if X_sub.shape[0] == 0:
+                    # Skip empty clusters to avoid NaN means (known issue,
+                    # see commit message).
+                    continue
+                C_new = np.mean(X_sub, axis=0)
+
+                # -- Compute the new precision matrix (named K_new here) --
+                K_new = np.zeros((L.shape[1], L.shape[2]))
+                for x in X_sub:
+                    x = np.reshape(x, (-1, 1))
+                    c_tmp = np.reshape(C_new, (-1, 1))
+                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
+                K_new = K_new / X_sub.shape[0]
+                # The covariance estimate may be singular; inverting it is the
+                # likely source of the infinite/NaN values noted in the commit message.
+                K_new = np.linalg.inv(K_new)
+
+                if end_algo and (not (self.C[k] == C_new).all()):  # stop once no centroid moves
+                    end_algo = False
+                self.C[k] = C_new
+                self.L[k] = K_new
+            i = i + 1
-- 
1.8.2.3
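
---

A minimal usage sketch of the new module, for review purposes only (not part of
the patch). It assumes volia/clustering_modules is on the Python path and that
features arrive as a float ndarray of shape (n_samples, n_features), which is
what fit() and predict() expect; the file names are illustrative:

    import numpy as np
    from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

    # Toy data: two well-separated Gaussian blobs in 4 dimensions.
    rng = np.random.default_rng(0)
    X = np.vstack([rng.normal(0.0, 1.0, (100, 4)),
                   rng.normal(5.0, 1.0, (100, 4))])

    model = kmeansMahalanobis()
    model.fit(X, 2)           # learns centroids C and per-cluster precision matrices L
    preds = model.predict(X)  # index of the closest centroid per sample

    model.save("clustering_2.pkl")      # pickles {"C", "L", "K"}
    restored = kmeansMahalanobis()
    restored.load("clustering_2.pkl")
    assert (restored.predict(X) == preds).all()

Note that _train() currently runs a t-SNE projection and writes a debug PDF on
every iteration, so fit() is slow on anything beyond toy data.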