Commit 4152e83df25ef19c8b048592e9629911bcf77e1a
1 parent: 3c07f672ad
Exists in: master
Adding kmeans mahalanobis. The algorithm is now fully functional; some problems remain with identity matrix usage and infinite or NaN values.
Showing 3 changed files with 216 additions and 17 deletions
volia/clustering.py
| ... | ... | @@ -6,6 +6,7 @@ |
| 6 | 6 | from sklearn.cluster import KMeans |
| 7 | 7 | import pickle |
| 8 | 8 | from clustering_modules.kmeans import kmeans |
| 9 | +from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | |
| 9 | 10 | |
| 10 | 11 | from sklearn.preprocessing import LabelEncoder |
| 11 | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
| ... | ... | @@ -15,7 +16,8 @@ |
| 15 | 16 | |
| 16 | 17 | |
| 17 | 18 | CLUSTERING_METHODS = { |
| 18 | - "k-means": kmeans() | |
| 19 | + "k-means": kmeans(), | |
| 20 | + "k-means-mahalanobis": kmeansMahalanobis() | |
| 19 | 21 | } |
| 20 | 22 | |
| 21 | 23 | EVALUATION_METHODS = { |
| ... | ... | @@ -65,8 +67,7 @@ |
| 65 | 67 | print(json.dumps(eval)) |
| 66 | 68 | |
| 67 | 69 | |
| 68 | - | |
| 69 | -def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | |
| 70 | +def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False): | |
| 70 | 71 | """ |
| 71 | 72 | |
| 72 | 73 | @param features: output features |
| ... | ... | @@ -75,6 +76,7 @@ |
| 75 | 76 | @param kmax: maximum k to compute |
| 76 | 77 | @param klist: list of k values to compute, ignore k value |
| 77 | 78 | @param output: output file if kmax not specified, else, output directory |
| 79 | + @param mahalanobis: if True, use the Mahalanobis-distance variant of k-means. | |
| 78 | 80 | """ |
| 79 | 81 | # -- READ FILES -- |
| 80 | 82 | features_dict = read_features(features) |
| ... | ... | @@ -91,9 +93,12 @@ |
| 91 | 93 | # Mono value case |
| 92 | 94 | if kmax is None and klist is None: |
| 93 | 95 | print(f"Computing clustering with k={k}") |
| 94 | - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | |
| 95 | - preds = kmeans.predict(X) | |
| 96 | - pickle.dump(kmeans, open(output, "wb")) | |
| 96 | + model = CLUSTERING_METHODS["k-means"] | |
| 97 | + if mahalanobis: | |
| 98 | + print("Computing with Mahalanobis distance") | |
| 99 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 100 | + model.fit(X, k) | |
| 101 | + model.save(output) | |
| 97 | 102 | |
| 98 | 103 | # Multi values case with kmax |
| 99 | 104 | if kmax is not None: |
| ... | ... | @@ -101,10 +106,11 @@ |
| 101 | 106 | mkdir(output) |
| 102 | 107 | Ks = range(k, kmax + 1) |
| 103 | 108 | for i in Ks: |
| 104 | - print(f"Computing clustering with k={i}") | |
| 105 | - kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | |
| 106 | - preds = kmeans.predict(X) | |
| 107 | - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | |
| 109 | + model = CLUSTERING_METHODS["k-means"] | |
| 110 | + if mahalanobis: | |
| 111 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 112 | + model.fit(X, i) | |
| 113 | + model.save(path.join(output, "clustering_" + str(i) + ".pkl")) | |
| 108 | 114 | |
| 109 | 115 | # Second multi values case with klist |
| 110 | 116 | if klist is not None: |
| ... | ... | @@ -112,10 +118,12 @@ |
| 112 | 118 | mkdir(output) |
| 113 | 119 | for k in klist: |
| 114 | 120 | k = int(k) |
| 115 | - print(f"Computing clustering with k={k}") | |
| 116 | - kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | |
| 117 | - preds = kmeans.predict(X) | |
| 118 | - pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | |
| 121 | + model = CLUSTERING_METHODS["k-means"] | |
| 122 | + if mahalanobis: | |
| 123 | + print("Computing with Mahalanobis distance") | |
| 124 | + model = CLUSTERING_METHODS["k-means-mahalanobis"] | |
| 125 | + model.fit(X, k) | |
| 126 | + model.save(path.join(output, "clustering_" + str(k) + ".pkl")) | |
| 119 | 127 | |
| 120 | 128 | |
| 121 | 129 | if __name__ == "__main__": |
| ... | ... | @@ -134,7 +142,10 @@ |
| 134 | 142 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
| 135 | 143 | parser_kmeans.add_argument("--klist", nargs="+", |
| 136 | 144 | help="List of k values to test. Like kmax, activates the multi-value mode.") |
| 137 | - parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | |
| 145 | + parser_kmeans.add_argument("--output", | |
| 146 | + default=".kmeans", | |
| 147 | + help="Output file if only k is given; output directory if kmax or klist is specified.") | |
| 148 | + parser_kmeans.add_argument("--mahalanobis", action="store_true", help="use the Mahalanobis distance for k-means.") | |
| 138 | 149 | parser_kmeans.set_defaults(which="kmeans") |
| 139 | 150 | |
| 140 | 151 | # measure |
volia/clustering_modules/kmeans.py
| ... | ... | @@ -8,8 +8,37 @@ |
| 8 | 8 | self.kmeans_model = None |
| 9 | 9 | |
| 10 | 10 | def predict(self, features): |
| 11 | + """ | |
| 12 | + Predict the closest cluster for each sample. | |
| 13 | + @param features: feature matrix, one sample per row | |
| 14 | + @return: array of predicted cluster indices | |
| 15 | + """ | |
| 11 | 16 | return self.kmeans_model.predict(features) |
| 12 | 17 | |
| 13 | - def load(self, model_path): | |
| 14 | - self.kmeans_model = pickle.load(open(model_path, "rb")) | |
| 18 | + def load(self, model_path: str): | |
| 19 | + """ | |
| 20 | + Load a pickled KMeans model from disk. | |
| 21 | + @param model_path: path to the pickle file | |
| 22 | + @return: None | |
| 23 | + """ | |
| 24 | + with open(model_path, "rb") as f: | |
| 25 | + self.kmeans_model = pickle.load(f) | |
| 26 | + | |
| 27 | + def save(self, model_path: str): | |
| 28 | + """ | |
| 29 | + Pickle the fitted KMeans model to disk. | |
| 30 | + @param model_path: destination path for the pickle file | |
| 31 | + @return: None | |
| 32 | + """ | |
| 33 | + with open(model_path, "wb") as f: | |
| 34 | + pickle.dump(self.kmeans_model, f) | |
| 35 | + | |
| 36 | + def fit(self, features, k: int): | |
| 37 | + """ | |
| 38 | + Fit a scikit-learn KMeans model on the given features. | |
| 39 | + @param features: training feature matrix, one sample per row | |
| 40 | + @param k: number of clusters | |
| 41 | + @return: None | |
| 42 | + """ | |
| 43 | + self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) |
volia/clustering_modules/kmeans_mahalanobis.py
| 1 | + | |
| 2 | + | |
| 3 | +from sklearn.cluster import KMeans | |
| 4 | +import pickle | |
| 5 | +import numpy as np | |
| 6 | +import matplotlib.pyplot as plt | |
| 7 | +from sklearn.manifold import TSNE | |
| 8 | +from abstract_clustering import AbstractClustering | |
| 9 | + | |
| 10 | +class kmeansMahalanobis(AbstractClustering): | |
| 11 | + def __init__(self): | |
| 12 | + """ | |
| 13 | + Centroids (C), per-cluster precision matrices (L) and number of clusters (K). | |
| 14 | + """ | |
| 15 | + self.C = None | |
| 16 | + self.L = None | |
| 17 | + self.K = None | |
| 18 | + | |
| 19 | + def predict(self, features): | |
| 20 | + """ | |
| 21 | + Assign each sample to the cluster with the smallest Mahalanobis distance. | |
| 22 | + @param features: feature matrix, one sample per row | |
| 23 | + @return: array of predicted cluster indices | |
| 24 | + """ | |
| 25 | + N = features.shape[0] | |
| 26 | + distances = np.zeros((N, self.K)) | |
| 27 | + for n in range(N): | |
| 28 | + for k in range(self.K): | |
| 29 | + distances[n][k] = self._dist(features[n], self.C[k], self.L[k]) | |
| 31 | + closest_cluster = np.argmin(distances, axis=1) | |
| 32 | + return closest_cluster | |
| 33 | + | |
| 34 | + def load(self, model_path): | |
| 35 | + """ | |
| 36 | + Load a pickled model (centroids C, precision matrices L, cluster count K). | |
| 37 | + @param model_path: path to the pickle file | |
| 38 | + @return: None | |
| 39 | + """ | |
| 40 | + with open(model_path, "rb") as f: | |
| 41 | + data = pickle.load(f) | |
| 42 | + if data is None: | |
| 43 | + raise Exception("The model could not be loaded") | |
| 44 | + self.C = data["C"] | |
| 45 | + self.L = data["L"] | |
| 46 | + self.K = data["K"] | |
| 49 | + | |
| 50 | + def save(self, model_path: str): | |
| 51 | + """ | |
| 52 | + Pickle the model parameters (C, L, K) to disk. | |
| 53 | + @param model_path: destination path for the pickle file | |
| 54 | + @return: None | |
| 55 | + """ | |
| 56 | + data = { | |
| 57 | + "C": self.C, | |
| 58 | + "L": self.L, | |
| 59 | + "K": self.K | |
| 60 | + } | |
| 61 | + with open(model_path, "wb") as f: | |
| 62 | + pickle.dump(data, f) | |
| 63 | + | |
| 64 | + def fit(self, features, K: int): | |
| 65 | + self._train(features, K) | |
| 66 | + | |
| 67 | + def _initialize_model(self, X, number_clusters): | |
| 68 | + d = X.shape[1] | |
| 69 | + C = X[np.random.choice(X.shape[0], number_clusters, replace=False)] # distinct initial centroids | |
| 70 | + L = np.zeros((number_clusters, d, d)) | |
| 71 | + for k in range(number_clusters): | |
| 72 | + L[k] = np.identity(d) | |
| 73 | + return C, L | |
| 74 | + | |
| 75 | + def _dist(self, a, b, l): | |
| 76 | + ''' | |
| 77 | + Squared Mahalanobis distance between a and b, with precision matrix l. | |
| 78 | + ''' | |
| 79 | + a = np.reshape(a, (-1, 1)) | |
| 80 | + b = np.reshape(b, (-1, 1)) | |
| 81 | + result = np.transpose(a - b).dot(l).dot(a - b)[0][0] | |
| 82 | + return result | |
| 83 | + | |
| 84 | + def _plot_iteration(self, iteration, points, clusters, centers): | |
| 85 | + fig = plt.figure() | |
| 86 | + ax = fig.add_subplot(111) | |
| 87 | + scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) | |
| 88 | + | |
| 91 | + ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+') | |
| 92 | + | |
| 93 | + ax.set_xlabel('x') | |
| 94 | + ax.set_ylabel('y') | |
| 95 | + plt.colorbar(scatter) | |
| 98 | + plt.savefig("test_" + str(iteration) + ".pdf") | |
| 99 | + | |
| 100 | + def _train(self, features, K: int): | |
| 101 | + X = features | |
| 102 | + N = X.shape[0] | |
| 103 | + d = X.shape[1] | |
| 104 | + | |
| 105 | + C, L = self._initialize_model(X, K) | |
| 106 | + self.C = C | |
| 107 | + self.L = L | |
| 108 | + self.K = K | |
| 109 | + | |
| 110 | + end_algo = False | |
| 111 | + i = 0 | |
| 112 | + while not end_algo: | |
| 113 | + if i == 10: | |
| 114 | + break # temporary iteration cap; convergence issues remain (see commit message) | |
| 115 | + print("Iteration: ", i) | |
| 116 | + # Compute the distance matrix | |
| 117 | + distances = np.zeros((N, K)) | |
| 118 | + | |
| 119 | + for n in range(N): | |
| 120 | + for k in range(self.K): | |
| 121 | + distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) | |
| 123 | + closest_cluster = np.argmin(distances, axis=1) | |
| 124 | + if i % 1 == 0: # i % 1 is always 0, so this plots every iteration | |
| 125 | + # -- Debug tool ---------------------- | |
| 126 | + # TSNE | |
| 128 | + X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | |
| 129 | + # Then plot | |
| 130 | + self._plot_iteration( | |
| 131 | + i, | |
| 132 | + X_embedded[:X.shape[0]], | |
| 133 | + closest_cluster, | |
| 134 | + X_embedded[X.shape[0]:] | |
| 135 | + ) | |
| 136 | + # ------------------------------------ | |
| 137 | + | |
| 138 | + end_algo = True | |
| 139 | + for k in range(K): | |
| 140 | + # Find the subset of X assigned to centroid c_k. | |
| 141 | + X_sub = np.where(closest_cluster == k) | |
| 142 | + X_sub = np.take(X, X_sub[0], axis=0) | |
| 143 | + C_new = np.mean(X_sub, axis=0) | |
| 145 | + | |
| 146 | + # -- COMPUTE NEW LAMBDA (here named K_new) -- | |
| 147 | + K_new = np.zeros((L.shape[1], L.shape[2])) | |
| 148 | + for x in X_sub: | |
| 149 | + x = np.reshape(x, (-1, 1)) | |
| 150 | + c_tmp = np.reshape(C_new, (-1, 1)) | |
| 151 | + K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) | |
| 152 | + K_new = K_new / X_sub.shape[0] # NaN if the cluster is empty (see commit message) | |
| 153 | + K_new = np.linalg.inv(K_new) # fails or overflows if K_new is singular | |
| 154 | + | |
| 155 | + if end_algo and (not (self.C[k] == C_new).all()): # keep iterating while any centroid still moves | |
| 156 | + end_algo = False | |
| 157 | + self.C[k] = C_new | |
| 158 | + self.L[k] = K_new | |
| 159 | + i = i + 1 |