Commit 660d9960f95ade5bb2446df6177425097c9b71a9
Parent: 78e6974959
Exists in: master
Adding n_init parameters
Showing 2 changed files with 53 additions and 28 deletions
volia/clustering.py
@@ -88,6 +88,25 @@
     @param output: output file if kmax not specified, else, output directory
     @param mahalanobis: distance option of k-means.
     """
+    json_content = locals().copy()
+
+    def fit_model(k: int, output_file):
+        if debug:
+            print(f"Computing clustering with k={k}")
+        model = CLUSTERING_METHODS["k-means"]
+        if mahalanobis:
+            if debug:
+                print("Mahalanobis activated")
+            model = CLUSTERING_METHODS["k-means-mahalanobis"]
+        model.fit(X, k, tol, ninit, maxiter, debug)
+        model.save(output_file)
+        json_content["models"].append({
+            "model_file": output_file,
+            "k": k,
+        })
+
+    json_content["models"] = []
+
     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
@@ -102,13 +121,7 @@
 
     # Mono value case
     if kmax is None and klist is None:
-        if debug:
-            print(f"Computing clustering with k={k}")
-        model = CLUSTERING_METHODS["k-means"]
-        if mahalanobis:
-            model = CLUSTERING_METHODS["k-means-mahalanobis"]
-        model.fit(X, k, tol, maxiter, debug)
-        model.save(output)
+        fit_model(k, output)
 
     # Multi values case with kmax
     if kmax is not None:
@@ -116,11 +129,7 @@
         mkdir(output)
         Ks = range(k, kmax + 1)
         for i in Ks:
-            model = CLUSTERING_METHODS["k-means"]
-            if mahalanobis:
-                model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, i, tol, maxiter, debug)
-            model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
+            fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))
 
     # Second multi values case with klist
     if klist is not None:
 
@@ -128,16 +137,10 @@
         mkdir(output)
         for k in klist:
             k = int(k)
-            model = CLUSTERING_METHODS["k-means"]
-            if mahalanobis:
-                model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, k, tol, maxiter, debug)
-            model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
+            fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl"))
 
-    # TODO: Output json to explain the end parameters like number of iteration, tol reached and stoped the process ?
-    # etc. (what distance, what parameters etc)
-    # TODO: Move example data into a directory.
-    # TODO: Add example receipts
+    print(json_content)
+    # TODO: compute loss with k-means mahalanobis.
     # TODO: n_init have to be taken into account for mahalanobis case of k-means algorithm.
 
 
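Note on the refactor above: the three duplicated fit/save blocks collapse into the single fit_model closure, which also records each saved model in json_content for the final dump. A minimal sketch of that pattern follows, with a stand-in model class; CLUSTERING_METHODS and the real wrappers are assumed from the diff, not shown here.

    import json

    class DummyModel:
        # Stand-in for volia's real k-means wrappers (not shown in this diff).
        def fit(self, X, k, tol, ninit, maxiter, debug):
            pass  # the real wrapper clusters X here

        def save(self, output_file):
            pass  # the real wrapper pickles itself here

    CLUSTERING_METHODS = {"k-means": DummyModel(),
                          "k-means-mahalanobis": DummyModel()}

    def cluster(X, klist, output, tol=1e-4, ninit=10, maxiter=300,
                mahalanobis=False, debug=False):
        json_content = locals().copy()  # snapshot of the call parameters
        json_content["models"] = []

        def fit_model(k, output_file):
            name = "k-means-mahalanobis" if mahalanobis else "k-means"
            model = CLUSTERING_METHODS[name]
            model.fit(X, k, tol, ninit, maxiter, debug)
            model.save(output_file)
            json_content["models"].append({"model_file": output_file, "k": k})

        for k in klist:
            fit_model(int(k), output + "/clustering_" + str(k) + ".pkl")
        print(json.dumps(json_content, default=str))

    cluster(X=[[0.0, 0.0], [1.0, 1.0]], klist=[2, 3], output="/tmp")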
volia/clustering_modules/kmeans_mahalanobis.py
@@ -60,8 +60,17 @@
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
 
-    def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
-        self._train(features, k, tol, maxiter, debug)
+    def fit(self, features, k: int, tol: float, ninit: int, maxiter: int=300, debug: bool=False):
+        results = []
+        for i in range(ninit):
+            results.append(self._train(features, k, tol, maxiter, debug))
+        losses = [v["loss"] for v in results]
+        best = results[losses.index(min(losses))]
+        if debug:
+            print(f"best loss: {best['loss']}")
+        self.C = best["C"]
+        self.L = best["L"]
+        self.K = best["K"]
 
     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
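The new fit() above implements an n-init restart: _train runs ninit times from fresh initializations and the run with the lowest loss wins. A minimal self-contained sketch of the same select-best pattern; the toy _train stand-in below is hypothetical, not volia code.

    import random

    def _toy_train(seed):
        # Hypothetical stand-in for _train: pretend each restart lands in a
        # different local optimum and report its loss and "centroids".
        random.seed(seed)
        return {"loss": random.uniform(1.0, 10.0), "C": "centroids-%d" % seed}

    def fit(ninit=5):
        results = [_toy_train(i) for i in range(ninit)]
        losses = [r["loss"] for r in results]
        best = results[losses.index(min(losses))]  # keep the lowest-loss run
        return best

    print(fit()["loss"])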
@@ -101,7 +110,6 @@
         N = X.shape[0]
         d = X.shape[1]
 
-        X_embedded = None
         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
 
 
 
@@ -114,13 +122,18 @@
                 print("Iteration: ", i)
 
             # Compute distance matrix
-            distances = np.zeros((N, K))
+            distances = np.zeros((N, self.K))
 
             for n in range(N):
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
+
             closest_cluster = np.argmin(distances, axis=1)
+            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
+            if debug:
+                print(f"loss {loss}")
 
+
             # -- Debug tool ----------------------
             if debug and i % 10 == 0:
                 # TSNE if needed
 
 
 
@@ -161,17 +174,26 @@
                 self.C[k] = C_new
                 self.L[k] = K_new
 
+
             diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
             if diff > tol:
                 end_algo = False
                 if debug:
                     print(f"{diff}")
-            elif debug:
-                print(f"Tolerance threshold {tol} reached with diff {diff}")
+            else:
+                if debug:
+                    print(f"Tolerance threshold {tol} reached with diff {diff}")
                 end_algo = True
+
             i = i + 1
             if i > maxiter:
                 end_algo = True
                 if debug:
                     print(f"Iteration {maxiter} reached")
+        return {
+            "loss": loss,
+            "C": self.C,
+            "K": self.K,
+            "L": self.L
+        }
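_dist is not shown in this diff; assuming L[k] holds a per-cluster precision (inverse covariance) matrix, the usual Mahalanobis parameterization, the distance-matrix and loss computation above can be read as in the hedged sketch below (not the project's actual _dist).

    import numpy as np

    def mahalanobis_dist(x, c, Lk):
        # d(x, c) = sqrt((x - c)^T L (x - c)) with L a precision matrix
        delta = x - c
        return float(np.sqrt(delta @ Lk @ delta))

    rng = np.random.default_rng(0)
    X = rng.normal(size=(6, 2))              # N=6 points in d=2
    C = np.zeros((2, 2))                     # K=2 centroids at the origin
    L = np.stack([np.eye(2), np.eye(2)])     # identity precision = Euclidean

    distances = np.zeros((len(X), len(C)))
    for n in range(len(X)):
        for k in range(len(C)):
            distances[n][k] = mahalanobis_dist(X[n], C[k], L[k])

    closest_cluster = np.argmin(distances, axis=1)
    # Same loss as in the diff: sum each point's distance to its closest centroid.
    loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
    print(loss)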