Commit 4152e83df25ef19c8b048592e9629911bcf77e1a
1 parent: 3c07f672ad
Exists in: master
Adding kmeans mahalanobis. The algorithm is now fully functional; some problems remain with identity matrix usage and infinite or NaN values.
Showing 3 changed files with 216 additions and 17 deletions
volia/clustering.py
| ... | ... | @@ -6,6 +6,7 @@ |
| 6 | 6 | from sklearn.cluster import KMeans |
| 7 | 7 | import pickle |
| 8 | 8 | from clustering_modules.kmeans import kmeans |
| 9 | +from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | |
| 9 | 10 | |
| 10 | 11 | from sklearn.preprocessing import LabelEncoder |
| 11 | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
| ... | ... | @@ -15,7 +16,8 @@ |
| 15 | 16 | |
| 16 | 17 | |
| 17 | 18 | CLUSTERING_METHODS = { |
| 18 | - "k-means": kmeans() | |
| 19 | + "k-means": kmeans(), | |
| 20 | + "k-means-mahalanobis": kmeansMahalanobis() | |
| 19 | 21 | } |
| 20 | 22 | |
| 21 | 23 | EVALUATION_METHODS = { |
| ... | ... | @@ -65,8 +67,7 @@ |
| 65 | 67 | print(json.dumps(eval)) |
| 66 | 68 | |
| 67 | 69 | |
| 68 | - | |
| 69 | -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | |
| 70 | +def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False): | |
| 70 | 71 | """ |
| 71 | 72 | |
| 72 | 73 | @param features: output features |
| ... | ... | @@ -75,6 +76,7 @@ |
| 75 | 76 | @param kmax: maximum k to compute |
| 76 | 77 | @param klist: list of k values to compute, ignore k value |
| 77 | 78 | @param output: output file if kmax not specified, else, output directory |
| 79 | + @param mahalanobis: if True, use the Mahalanobis-distance variant of k-means. | |
| 78 | 80 | """ |
| 79 | 81 | # -- READ FILES -- |
| 80 | 82 | features_dict = read_features(features) |
| ... | ... | @@ -91,9 +93,12 @@ |
| 91 | 93 | # Mono value case |
| 92 | 94 | if kmax is None and klist is None: |
| 93 | 95 | print(f"Computing clustering with k={k}") |
| 94 | - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | |
| 95 | - preds = kmeans.predict(X) | |
| 96 | - pickle.dump(kmeans, open(output, "wb")) | |
| 96 | + model = CLUSTERING_METHODS["k-means"] | |
| 97 | + if mahalanobis: | |
| 98 | + print("Computing with Mahalanobis distance") | |
| 99 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 100 | + model.fit(X, k) | |
| 101 | + model.save(output) | |
| 97 | 102 | |
| 98 | 103 | # Multi values case with kmax |
| 99 | 104 | if kmax is not None: |
| ... | ... | @@ -101,10 +106,11 @@ |
| 101 | 106 | mkdir(output) |
| 102 | 107 | Ks = range(k, kmax + 1) |
| 103 | 108 | for i in Ks: |
| 104 | - print(f"Computing clustering with k={i}") | |
| 105 | - kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | |
| 106 | - preds = kmeans.predict(X) | |
| 107 | - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | |
| 109 | + model = CLUSTERING_METHODS["k-means"] | |
| 110 | + if mahalanobis: | |
| 111 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 112 | + model.fit(X, i) | |
| 113 | + model.save(path.join(output, "clustering_" + str(i) + ".pkl")) | |
| 108 | 114 | |
| 109 | 115 | # Second multi values case with klist |
| 110 | 116 | if klist is not None: |
| ... | ... | @@ -112,10 +118,12 @@ |
| 112 | 118 | mkdir(output) |
| 113 | 119 | for k in klist: |
| 114 | 120 | k = int(k) |
| 115 | - print(f"Computing clustering with k={k}") | |
| 116 | - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | |
| 117 | - preds = kmeans.predict(X) | |
| 118 | - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | |
| 121 | + model = CLUSTERING_METHODS["k-means"] | |
| 122 | + if mahalanobis: | |
| 123 | + print("Computing with Mahalanobis distance") | |
| 124 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 125 | + model.fit(X, k) | |
| 126 | + model.save(path.join(output, "clustering_" + str(k) + ".pkl")) | |
| 119 | 127 | |
| 120 | 128 | |
| 121 | 129 | if __name__ == "__main__": |
| ... | ... | @@ -134,7 +142,10 @@ |
| 134 | 142 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
| 135 | 143 | parser_kmeans.add_argument("--klist", nargs="+", |
| 136 | 144 | help="List of k values to test. Like kmax, activates the multi-value mode.") |
| 137 | - parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | |
| 145 | + parser_kmeans.add_argument("--output", | |
| 146 | + default=".kmeans", | |
| 147 | + help="Output file if only k is given; output directory if kmax or klist is specified.") | |
| 148 | + parser_kmeans.add_argument("--mahalanobis", action="store_true", help="use the Mahalanobis distance for k-means.") | |
| 138 | 149 | parser_kmeans.set_defaults(which="kmeans") |
| 139 | 150 | |
| 140 | 151 | # measure |
volia/clustering_modules/kmeans.py
| ... | ... | @@ -8,8 +8,37 @@ |
| 8 | 8 | self.kmeans_model = None |
| 9 | 9 | |
| 10 | 10 | def predict(self, features): |
| 11 | + """ | |
| 12 | + Predict the closest cluster for each sample. | |
| 13 | + @param features: feature matrix, one sample per row | |
| 14 | + @return: array of predicted cluster indices | |
| 15 | + """ | |
| 11 | 16 | return self.kmeans_model.predict(features) |
| 12 | 17 | |
| 13 | - def load(self, model_path): | |
| 14 | - self.kmeans_model = pickle.load(open(model_path, "rb")) | |
| 18 | + def load(self, model_path: str): | |
| 19 | + """ | |
| 20 | + Load a pickled KMeans model from disk. | |
| 21 | + @param model_path: path to the pickle file | |
| 22 | + @return: None | |
| 23 | + """ | |
| 24 | + with open(model_path, "rb") as f: | |
| 25 | + self.kmeans_model = pickle.load(f) | |
| 26 | + | |
| 27 | + def save(self, model_path: str): | |
| 28 | + """ | |
| 29 | + Pickle the fitted KMeans model to disk. | |
| 30 | + @param model_path: destination path for the pickle file | |
| 31 | + @return: None | |
| 32 | + """ | |
| 33 | + with open(model_path, "wb") as f: | |
| 34 | + pickle.dump(self.kmeans_model, f) | |
| 35 | + | |
| 36 | + def fit(self, features, k: int): | |
| 37 | + """ | |
| 38 | + Fit a scikit-learn KMeans model on the given features. | |
| 39 | + @param features: training feature matrix, one sample per row | |
| 40 | + @param k: number of clusters | |
| 41 | + @return: None | |
| 42 | + """ | |
| 43 | + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) |
volia/clustering_modules/kmeans_mahalanobis.py
| 1 | + | |
| 2 | + | |
| 3 | +from sklearn.cluster import KMeans | |
| 4 | +import pickle | |
| 5 | +import numpy as np | |
| 6 | +import matplotlib.pyplot as plt | |
| 7 | +from sklearn.manifold import TSNE | |
| 8 | +from abstract_clustering import AbstractClustering | |
| 9 | + | |
| 10 | +class kmeansMahalanobis(AbstractClustering): | |
| 11 | + def __init__(self): | |
| 12 | + """ | |
| 13 | + Centroids (C), per-cluster precision matrices (L) and number of clusters (K). | |
| 14 | + """ | |
| 15 | + self.C = None | |
| 16 | + self.L = None | |
| 17 | + self.K = None | |
| 18 | + | |
| 19 | + def predict(self, features): | |
| 20 | + """ | |
| 21 | + Assign each sample to the cluster with the smallest Mahalanobis distance. | |
| 22 | + @param features: feature matrix, one sample per row | |
| 23 | + @return: array of predicted cluster indices | |
| 24 | + """ | |
| 25 | + N = features.shape[0] | |
| 26 | + distances = np.zeros((N, self.K)) | |
| 27 | + for n in range(N): | |
| 28 | + for k in range(self.K): | |
| 29 | + distances[n][k] = self._dist(features[n], self.C[k], self.L[k]) | |
| 31 | + closest_cluster = np.argmin(distances, axis=1) | |
| 32 | + return closest_cluster | |
| 33 | + | |
| 34 | + def load(self, model_path): | |
| 35 | + """ | |
| 36 | + Load a pickled model (centroids C, precision matrices L, cluster count K). | |
| 37 | + @param model_path: path to the pickle file | |
| 38 | + @return: None | |
| 39 | + """ | |
| 40 | + with open(model_path, "rb") as f: | |
| 41 | + data = pickle.load(f) | |
| 42 | + if data is None: | |
| 43 | + raise Exception("The model could not be loaded") | |
| 44 | + self.C = data["C"] | |
| 45 | + self.L = data["L"] | |
| 46 | + self.K = data["K"] | |
| 49 | + | |
| 50 | + def save(self, model_path: str): | |
| 51 | + """ | |
| 52 | + Pickle the model parameters (C, L, K) to disk. | |
| 53 | + @param model_path: destination path for the pickle file | |
| 54 | + @return: None | |
| 55 | + """ | |
| 56 | + data = { | |
| 57 | + "C": self.C, | |
| 58 | + "L": self.L, | |
| 59 | + "K": self.K | |
| 60 | + } | |
| 61 | + with open(model_path, "wb") as f: | |
| 62 | + pickle.dump(data, f) | |
| 63 | + | |
| 64 | + def fit(self, features, K: int): | |
| 65 | + self._train(features, K) | |
| 66 | + | |
| 67 | + def _initialize_model(self, X, number_clusters): | |
| 68 | + d = X.shape[1] | |
| 69 | + C = X[np.random.choice(X.shape[0], number_clusters, replace=False)] # distinct initial centroids | |
| 70 | + L = np.zeros((number_clusters, d, d)) | |
| 71 | + for k in range(number_clusters): | |
| 72 | + L[k] = np.identity(d) | |
| 73 | + return C, L | |
| 74 | + | |
| 75 | + def _dist(self, a, b, l): | |
| 76 | + ''' | |
| 77 | + Squared Mahalanobis distance between a and b, with precision matrix l. | |
| 78 | + ''' | |
| 79 | + a = np.reshape(a, (-1, 1)) | |
| 80 | + b = np.reshape(b, (-1, 1)) | |
| 81 | + result = np.transpose(a - b).dot(l).dot(a - b)[0][0] | |
| 82 | + return result | |
| 83 | + | |
| 84 | + def _plot_iteration(self, iteration, points, clusters, centers): | |
| 85 | + fig = plt.figure() | |
| 86 | + ax = fig.add_subplot(111) | |
| 87 | + scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) | |
| 88 | + | |
| 91 | + ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+') | |
| 92 | + | |
| 93 | + ax.set_xlabel('x') | |
| 94 | + ax.set_ylabel('y') | |
| 95 | + plt.colorbar(scatter) | |
| 98 | + plt.savefig("test_" + str(iteration) + ".pdf") | |
| 99 | + | |
| 100 | + def _train(self, features, K: int): | |
| 101 | + X = features | |
| 102 | + N = X.shape[0] | |
| 103 | + d = X.shape[1] | |
| 104 | + | |
| 105 | + C, L = self._initialize_model(X, K) | |
| 106 | + self.C = C | |
| 107 | + self.L = L | |
| 108 | + self.K = K | |
| 109 | + | |
| 110 | + end_algo = False | |
| 111 | + i = 0 | |
| 112 | + while not end_algo: | |
| 113 | + if i == 10: | |
| 114 | + break # temporary iteration cap; convergence issues remain (see commit message) | |
| 115 | + print("Iteration: ", i) | |
| 116 | + # Compute the distance matrix | |
| 117 | + distances = np.zeros((N, K)) | |
| 118 | + | |
| 119 | + for n in range(N): | |
| 120 | + for k in range(self.K): | |
| 121 | + distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) | |
| 123 | + closest_cluster = np.argmin(distances, axis=1) | |
| 124 | + if i % 1 == 0: # i % 1 is always 0, so this plots every iteration | |
| 125 | + # -- Debug tool ---------------------- | |
| 126 | + # TSNE | |
| 128 | + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
| 129 | + # Then plot | |
| 130 | + self._plot_iteration( | |
| 131 | + i, | |
| 132 | + X_embedded[:X.shape[0]], | |
| 133 | + closest_cluster, | |
| 134 | + X_embedded[X.shape[0]:] | |
| 135 | + ) | |
| 136 | + # ------------------------------------ | |
| 137 | + | |
| 138 | + end_algo = True | |
| 139 | + for k in range(K): | |
| 140 | + # Find the subset of X assigned to centroid c_k. | |
| 141 | + X_sub = np.where(closest_cluster == k) | |
| 142 | + X_sub = np.take(X, X_sub[0], axis=0) | |
| 143 | + C_new = np.mean(X_sub, axis=0) | |
| 145 | + | |
| 146 | + # -- COMPUTE NEW LAMBDA (here named K_new) -- | |
| 147 | + K_new = np.zeros((L.shape[1], L.shape[2])) | |
| 148 | + for x in X_sub: | |
| 149 | + x = np.reshape(x, (-1, 1)) | |
| 150 | + c_tmp = np.reshape(C_new, (-1, 1)) | |
| 151 | + K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) | |
| 152 | + K_new = K_new / X_sub.shape[0] # NaN if the cluster is empty (see commit message) | |
| 153 | + K_new = np.linalg.inv(K_new) # fails or overflows if K_new is singular | |
| 154 | + | |
| 155 | + if end_algo and (not (self.C[k] == C_new).all()): # keep iterating while any centroid still moves | |
| 156 | + end_algo = False | |
| 157 | + self.C[k] = C_new | |
| 158 | + self.L[k] = K_new | |
| 159 | + i = i + 1 |