Commit ed89325d5d02f6e7878e3fd52498c8ad1ca653be
1 parent: d4507c2683
Exists in: master
Now we can give more parameters to the k-means command. Mahalanobis was tested and seems to work well. Needs more tests.
Showing 3 changed files with 71 additions and 26 deletions
volia/clustering.py
... | ... | @@ -67,7 +67,17 @@ |
67 | 67 | print(json.dumps(eval)) |
68 | 68 | |
69 | 69 | |
70 | -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False): | |
70 | +def kmeans_run(features: str, | |
71 | + lst: str, | |
72 | + k: int, |
73 | + kmax: int, | |
74 | + klist, | |
75 | + maxiter: int, | |
76 | + ninit: int, | |
77 | + output: str, | |
78 | + tol: float, | |
79 | + debug: bool = False, | |
80 | + mahalanobis: bool = False): |
71 | 81 | """ |
72 | 82 | |
73 | 83 | @param features: output features |
74 | 84 | |
75 | 85 | |
... | ... | @@ -92,12 +102,12 @@ |
92 | 102 | |
93 | 103 | # Single-value case |
94 | 104 | if kmax is None and klist is None: |
95 | - print(f"Computing clustering with k={k}") | |
105 | + if debug: | |
106 | + print(f"Computing clustering with k={k}") | |
96 | 107 | model = CLUSTERING_METHODS["k-means"] |
97 | 108 | if mahalanobis: |
98 | - print("Computing with mahalanobis distance") | |
99 | 109 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
100 | - model.fit(X, k) | |
110 | + model.fit(X, k, tol, maxiter, debug) | |
101 | 111 | model.save(output) |
102 | 112 | |
103 | 113 | # Multi-value case with kmax |
... | ... | @@ -109,7 +119,7 @@ |
109 | 119 | model = CLUSTERING_METHODS["k-means"] |
110 | 120 | if mahalanobis: |
111 | 121 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
112 | - model.fit(X, i) | |
122 | + model.fit(X, i, tol, maxiter, debug) | |
113 | 123 | model.save(path.join(output, "clustering_" + str(i) + ".pkl")) |
114 | 124 | |
115 | 125 | # Second multi-value case, with klist |
116 | 126 | |
117 | 127 | |
118 | 128 | |
... | ... | @@ -120,12 +130,17 @@ |
120 | 130 | k = int(k) |
121 | 131 | model = CLUSTERING_METHODS["k-means"] |
122 | 132 | if mahalanobis: |
123 | - print("Computing with mahalanobis distance") | |
124 | 133 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
125 | - model.fit(X, k) | |
134 | + model.fit(X, k, tol, maxiter, debug) | |
126 | 135 | model.save(path.join(output, "clustering_" + str(k) + ".pkl")) |
127 | 136 | |
137 | + # TODO: Output JSON describing the final state (number of iterations, whether tol was reached and |
138 | + # stopped the process, which distance, which parameters, etc.). |
139 | + # TODO: Move example data into a directory. |
140 | + # TODO: Add example recipes. |
141 | + # TODO: n_init has to be taken into account in the Mahalanobis case of the k-means algorithm. |
128 | 142 | |
143 | + | |
129 | 144 | if __name__ == "__main__": |
130 | 145 | # Main parser |
131 | 146 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
... | ... | @@ -142,6 +157,19 @@ |
142 | 157 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="If specified, k is treated as kmin.") |
143 | 158 | parser_kmeans.add_argument("--klist", nargs="+", |
144 | 159 | help="List of k values to test. Like kmax, activates the multi-value mode.") |
160 | + parser_kmeans.add_argument("--maxiter", | |
161 | + type=int, | |
162 | + default=300, | |
163 | + help="Max number of iteration before stoping if not converging") | |
164 | + parser_kmeans.add_argument("--ninit", | |
165 | + type=int, | |
166 | + default=10, | |
167 | + help="Number of time the k-means algorithm will be run with different centroid seeds.") | |
168 | + parser_kmeans.add_argument("--tol", | |
169 | + type=float, | |
170 | + default=0.0001, | |
171 | + help="Tolerance to finish of distance between centroids and their updates.") | |
172 | + parser_kmeans.add_argument("--debug", action="store_true", help="Print debug information during clustering.") |
145 | 173 | parser_kmeans.add_argument("--output", |
146 | 174 | default=".kmeans", |
147 | 175 | help="output file if only k. Output directory if multiple kmax specified.") |
volia/clustering_modules/kmeans.py
... | ... | @@ -33,12 +33,12 @@ |
33 | 33 | with open(model_path, "wb") as f: |
34 | 34 | pickle.dump(self.kmeans_model, f) |
35 | 35 | |
36 | - def fit(self, features, k: int): | |
36 | + def fit(self, features, k: int, tol: float, maxiter: int = 300, debug: bool = False): |
37 | 37 | """ |
38 | 38 | |
39 | 39 | @param features: |
40 | 40 | @param k: |
41 | 41 | @return: |
42 | 42 | """ |
43 | - self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) | |
43 | + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features) |
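Note that n_init is still hard-coded to 10 in this call: the new --ninit argument is parsed but not yet forwarded to fit (see the TODO above); only max_iter and tol are. A minimal sketch of the equivalent direct scikit-learn call, with a hypothetical feature matrix:

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.random.rand(100, 20)  # hypothetical features
    model = KMeans(n_clusters=8, n_init=10, random_state=0,
                   max_iter=300, tol=0.0001).fit(X)
    print(model.n_iter_)  # number of iterations actually run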
volia/clustering_modules/kmeans_mahalanobis.py
... | ... | @@ -37,8 +37,8 @@ |
37 | 37 | @return: |
38 | 38 | """ |
39 | 39 | data = None |
40 | - with open(model_path): | |
41 | - data = pickle.load() | |
40 | + with open(model_path, "rb") as f: | |
41 | + data = pickle.load(f) | |
42 | 42 | if data is None: |
43 | 43 | raise Exception("The model could not be loaded") |
44 | 44 | else: |
... | ... | @@ -60,8 +60,8 @@ |
60 | 60 | with open(modelpath, "wb") as f: |
61 | 61 | pickle.dump(data, f) |
62 | 62 | |
63 | - def fit(self, features, K: int): | |
64 | - self._train(features, K) | |
63 | + def fit(self, features, k: int, tol: float = 0.0001, maxiter: int = 300, debug: bool = False): |
64 | + self._train(features, k, tol, maxiter, debug) | |
65 | 65 | |
66 | 66 | def _initialize_model(self, X, number_clusters): |
67 | 67 | d = X.shape[1] |
68 | 68 | |
... | ... | @@ -96,11 +96,12 @@ |
96 | 96 | #plt.xlim(0, 1) |
97 | 97 | plt.savefig("test_" + str(iteration) + ".pdf") |
98 | 98 | |
99 | - def _train(self, features, K: int): | |
99 | + def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False): | |
100 | 100 | X = features |
101 | 101 | N = X.shape[0] |
102 | 102 | d = X.shape[1] |
103 | 103 | |
104 | + X_embedded = None | |
104 | 105 | C, L = self._initialize_model(X, K) |
105 | 106 | self.C = C |
106 | 107 | self.L = L |
... | ... | @@ -109,9 +110,9 @@ |
109 | 110 | end_algo = False |
110 | 111 | i = 0 |
111 | 112 | while not end_algo: |
112 | - if i == 10: | |
113 | - exit(1) | |
114 | - print("Iteration: ", i) | |
113 | + if debug: | |
114 | + print("Iteration: ", i) | |
115 | + | |
115 | 116 | # Compute the distance matrix |
116 | 117 | distances = np.zeros((N, K)) |
117 | 118 | |
... | ... | @@ -119,11 +120,14 @@ |
119 | 120 | for k in range(self.K): |
120 | 121 | distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) |
121 | 122 | closest_cluster = np.argmin(distances, axis=1) |
122 | - if i % 1 == 0: | |
123 | - # -- Debug tool ---------------------- | |
124 | - # TSNE | |
125 | - #X_embedded = np.concatenate((X, self.C), axis=0) | |
126 | - X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
123 | + | |
124 | + # -- Debug tool ---------------------- | |
125 | + if debug and i % 10 == 0: | |
126 | + # TSNE if needed | |
127 | + X_embedded = np.concatenate((X, self.C), axis=0) | |
128 | + if d > 2: | |
129 | + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
130 | + | |
127 | 131 | # Then plot |
128 | 132 | self._plot_iteration( |
129 | 133 | i, |
130 | 134 | X_embedded[:X.shape[0]], |
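The body of _dist is not shown in this diff. Assuming it implements the standard Mahalanobis distance, with self.L[k] the precision matrix obtained below by pseudo-inverting the per-cluster covariance, a sketch would be:

    import numpy as np

    def mahalanobis_dist(x, c, L):
        # Distance between point x and centroid c under precision matrix L:
        # sqrt((x - c)^T L (x - c))
        delta = x - c
        return np.sqrt(delta @ L @ delta)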
... | ... | @@ -131,9 +135,9 @@ |
131 | 135 | closest_cluster, |
132 | 136 | X_embedded[X.shape[0]:] |
133 | 137 | ) |
134 | - # ------------------------------------ | |
138 | + # ------------------------------------ | |
135 | 139 | |
136 | - end_algo = True | |
140 | + old_c = self.C.copy() | |
137 | 141 | for k in range(K): |
138 | 142 | # Find the subset of X closest to the centroid c_k. |
139 | 143 | X_sub = np.where(closest_cluster == k) |
140 | 144 | |
141 | 145 | |
... | ... | @@ -152,9 +156,22 @@ |
152 | 156 | K_new = K_new / X_sub.shape[0] |
153 | 157 | K_new = np.linalg.pinv(K_new) |
154 | 158 | |
155 | - if end_algo and (not (self.C[k] == C_new).all()): # If the same stop | |
156 | - end_algo = False | |
159 | + #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop | |
160 | + # end_algo = False | |
157 | 161 | self.C[k] = C_new |
158 | 162 | self.L[k] = K_new |
163 | + | |
164 | + diff = np.sum(np.absolute((self.C - old_c) / old_c * 100)) | |
165 | + if diff > tol: |
166 | + if debug: |
167 | + print(f"diff: {diff}") |
168 | + else: |
169 | + end_algo = True |
170 | + if debug: |
171 | + print(f"Tolerance threshold {tol} reached with diff {diff}") |
159 | 172 | i = i + 1 |
173 | + if i >= maxiter: |
174 | + end_algo = True |
175 | + if debug: |
176 | + print(f"Maximum number of iterations ({maxiter}) reached")
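To summarize the stopping rule above: the loop ends when the summed relative centroid change (in percent) drops to tol or below, or when maxiter iterations have run. A standalone sketch of the criterion with hypothetical centroid values:

    import numpy as np

    old_c = np.array([[1.0, 2.0], [3.0, 4.0]])
    new_c = np.array([[1.001, 2.0], [3.0, 3.999]])
    tol = 0.0001

    diff = np.sum(np.absolute((new_c - old_c) / old_c * 100))  # summed % change
    print(diff, diff <= tol)  # ~0.125, False: keep iterating

Since diff is a percentage while the default tol is 0.0001, convergence by tolerance is strict in practice; the maxiter cap guarantees termination either way.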