Commit 4309b4a340144c6dff7757892cd5539e89e20538
1 parent 88d1d67e9d
Exists in master
Adding constrained Mahalanobis to help convergence
Showing 2 changed files with 23 additions and 12 deletions
volia/clustering.py
@@ -17,7 +17,8 @@
 
 CLUSTERING_METHODS = {
     "k-means": kmeans(),
-    "k-means-mahalanobis": kmeansMahalanobis()
+    "k-means-mahalanobis": kmeansMahalanobis(),
+    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True)
 }
 
 EVALUATION_METHODS = {
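For context, a minimal usage sketch of the new registry entry. The driver code that consumes CLUSTERING_METHODS is not part of this diff, so the surrounding calls are assumptions and the model path is a placeholder:

model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
# ... fit the model on features, then persist it; save() below now also
# serializes the new `constrained` flag alongside C, L and K.
model.save("exp/kmeans-constrained.pkl")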
volia/clustering_modules/kmeans_mahalanobis.py
@@ -8,13 +8,14 @@
 from abstract_clustering import AbstractClustering
 
 class kmeansMahalanobis():
-    def __init__(self):
+    def __init__(self, constrained: bool = False):
         """
 
         """
         self.C = None
         self.L = None
         self.K = None
+        self.constrained = constrained
 
     def predict(self, features):
         """
@@ -45,6 +46,7 @@
         self.C = data["C"]
         self.L = data["L"]
         self.K = data["K"]
+        self.constrained = data["constrained"]
 
     def save(self, modelpath: str):
         """
@@ -55,7 +57,8 @@
         data = {
             "C": self.C,
             "L": self.L,
-            "K": self.K
+            "K": self.K,
+            "constrained": self.constrained
         }
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
@@ -82,11 +85,11 @@
 
     def _dist(self, a, b, l):
         '''
-        Euclidean distance
+        Mahalanobis distance
         '''
         a = np.reshape(a, (-1, 1))
         b = np.reshape(b, (-1, 1))
-        result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
+        result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
         return result
 
     def _plot_iteration(self, iteration, points, clusters, centers):
@@ -129,17 +132,18 @@
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
 
             closest_cluster = np.argmin(distances, axis=1)
+
             loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
             if debug:
                 print(f"loss {loss}")
 
 
             # -- Debug tool ----------------------
-            if debug and i % 10 == 0:
+            if debug and i % 1 == 0:
                 # TSNE if needed
                 X_embedded = np.concatenate((X, self.C), axis=0)
                 if d > 2:
-                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, self.C), axis=0))
 
                 # Then plot
                 self._plot_iteration(
@@ -151,22 +155,28 @@
             # ------------------------------------
 
             old_c = self.C.copy()
-            for k in range(K):
+            for k in range(self.K):
                 # Find the subset of X close to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
                 X_sub = np.take(X, X_sub[0], axis=0)
                 if X_sub.shape[0] == 0:
                     continue
-                np.mean(X_sub, axis=0)
+
                 C_new = np.mean(X_sub, axis=0)
 
                 # -- COMPUTE NEW LAMBDA (here named K) --
-                K_new = np.zeros((L.shape[1], L.shape[2]))
+                K_new = np.zeros((self.L.shape[1], self.L.shape[2]))
+                tmp = np.zeros((self.L.shape[1], self.L.shape[2]))
                 for x in X_sub:
                     x = np.reshape(x, (-1, 1))
                     c_tmp = np.reshape(C_new, (-1, 1))
-                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
-                K_new = K_new / X_sub.shape[0]
+                    #K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
+
+                    tmp = tmp + (x - c_tmp).dot((x - c_tmp).transpose())
+                if self.constrained:
+                    K_new = (tmp / X_sub.shape[0]) / np.power(np.linalg.det(tmp / X_sub.shape[0]), 1/d)
+                else:
+                    K_new = tmp / X_sub.shape[0]
                 K_new = np.linalg.pinv(K_new)
 
                 #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop
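
The constrained branch above rescales each per-cluster covariance to unit determinant before inversion: dividing the empirical covariance by det(cov)**(1/d) fixes the metric's scale, so the loss can no longer be lowered simply by inflating a cluster's covariance, which is the degenerate behaviour the commit message targets. A minimal standalone sketch of that update (the helper name is illustrative, not part of the commit):

import numpy as np

def constrained_precision(X_sub, center):
    # Empirical covariance of the cluster around its new centroid;
    # diff.T @ diff equals the sum of outer products accumulated in `tmp`.
    n, d = X_sub.shape
    diff = X_sub - np.reshape(center, (1, -1))
    cov = diff.T @ diff / n
    # Constrained variant: rescale so that det(cov) == 1.
    cov = cov / np.power(np.linalg.det(cov), 1 / d)
    # Pseudo-inverse gives the Lambda matrix consumed by _dist().
    return np.linalg.pinv(cov)

One caveat on the sketch: when a cluster has fewer points than dimensions, det(cov) is 0 and the normalization divides by zero; the unconstrained branch (tmp / X_sub.shape[0] followed by pinv) does not have this failure mode.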