diff --git a/volia/clustering.py b/volia/clustering.py index e4abf5e..d6b96fd 100644 --- a/volia/clustering.py +++ b/volia/clustering.py @@ -17,7 +17,8 @@ import json CLUSTERING_METHODS = { "k-means": kmeans(), - "k-means-mahalanobis": kmeansMahalanobis() + "k-means-mahalanobis": kmeansMahalanobis(), + "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True) } EVALUATION_METHODS = { diff --git a/volia/clustering_modules/kmeans_mahalanobis.py b/volia/clustering_modules/kmeans_mahalanobis.py index 2f3f2b5..2f84653 100644 --- a/volia/clustering_modules/kmeans_mahalanobis.py +++ b/volia/clustering_modules/kmeans_mahalanobis.py @@ -8,13 +8,14 @@ from sklearn.manifold import TSNE from abstract_clustering import AbstractClustering class kmeansMahalanobis(): - def __init__(self): + def __init__(self, constrained: bool = False): """ """ self.C = None self.L = None self.K = None + self.constrained = constrained def predict(self, features): """ @@ -45,6 +46,7 @@ class kmeansMahalanobis(): self.C = data["C"] self.L = data["L"] self.K = data["K"] + self.constrained = data.get("constrained", False) def save(self, modelpath: str): """ @@ -55,7 +57,8 @@ class kmeansMahalanobis(): data = { "C": self.C, "L": self.L, - "K": self.K + "K": self.K, + "constrained": self.constrained } with open(modelpath, "wb") as f: pickle.dump(data, f) @@ -82,11 +85,11 @@ class kmeansMahalanobis(): def _dist(self, a, b, l): ''' - Distance euclidienne + Mahalanobis distance between a and b, parameterized by the precision matrix l ''' a = np.reshape(a, (-1, 1)) b = np.reshape(b, (-1, 1)) - result = np.transpose(a - b).dot(l).dot(a-b)[0][0] + result = np.transpose(a - b).dot(l).dot(a - b)[0][0] return result def _plot_iteration(self, iteration, points, clusters, centers): @@ -129,17 +132,18 @@ class kmeansMahalanobis(): distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) closest_cluster = np.argmin(distances, axis=1) + loss = np.sum(distances[np.arange(len(distances)), closest_cluster]) if debug: print(f"loss {loss}") # -- Debug tool 
---------------------- - if debug and i % 10 == 0: + if debug: # TSNE if needed X_embedded = np.concatenate((X, self.C), axis=0) if d > 2: - X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, self.C), axis=0)) # Then plot self._plot_iteration( @@ -151,22 +155,28 @@ class kmeansMahalanobis(): # ------------------------------------ old_c = self.C.copy() - for k in range(K): + for k in range(self.K): # Find subset of X with values closed to the centroid c_k. X_sub = np.where(closest_cluster == k) X_sub = np.take(X, X_sub[0], axis=0) if X_sub.shape[0] == 0: continue - np.mean(X_sub, axis=0) + C_new = np.mean(X_sub, axis=0) # -- COMPUTE NEW LAMBDA (here named K) -- - K_new = np.zeros((L.shape[1], L.shape[2])) + K_new = np.zeros((self.L.shape[1], self.L.shape[2])) + tmp = np.zeros((self.L.shape[1], self.L.shape[2])) for x in X_sub: x = np.reshape(x, (-1, 1)) c_tmp = np.reshape(C_new, (-1, 1)) - K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) - K_new = K_new / X_sub.shape[0] + #K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) + + tmp = tmp + (x - c_tmp).dot((x - c_tmp).transpose()) + if self.constrained: + K_new = (tmp / X_sub.shape[0]) / np.power(np.linalg.det((tmp / X_sub.shape[0])), 1/d) + else: + K_new = tmp / X_sub.shape[0] K_new = np.linalg.pinv(K_new) #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop