Commit ed89325d5d02f6e7878e3fd52498c8ad1ca653be

Authored by quillotm
1 parent d4507c2683
Exists in master

Now we can pass more parameters to the k-means command. The Mahalanobis variant was tested and seems to work well. More tests are needed.

Showing 3 changed files with 71 additions and 26 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst, read_labels 4 from core.data import read_features, read_lst, read_labels
5 import numpy as np 5 import numpy as np
6 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
7 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans 8 from clustering_modules.kmeans import kmeans
9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis 9 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis
10 10
11 from sklearn.preprocessing import LabelEncoder 11 from sklearn.preprocessing import LabelEncoder
12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score 12 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
13 13
14 import core.measures 14 import core.measures
15 import json 15 import json
16 16
17 17
18 CLUSTERING_METHODS = { 18 CLUSTERING_METHODS = {
19 "k-means": kmeans(), 19 "k-means": kmeans(),
20 "k-means-mahalanobis": kmeansMahalanobis() 20 "k-means-mahalanobis": kmeansMahalanobis()
21 } 21 }
22 22
23 EVALUATION_METHODS = { 23 EVALUATION_METHODS = {
24 "entropy": core.measures.entropy_score, 24 "entropy": core.measures.entropy_score,
25 "purity": core.measures.purity_score, 25 "purity": core.measures.purity_score,
26 "v-measure": v_measure_score, 26 "v-measure": v_measure_score,
27 "homogeneity": homogeneity_score, 27 "homogeneity": homogeneity_score,
28 "completeness": completeness_score, 28 "completeness": completeness_score,
29 } 29 }
30 30
31 31
32 def disequilibrium_run(): 32 def disequilibrium_run():
33 pass 33 pass
34 34
35 35
36 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): 36 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
37 """ 37 """
38 38
39 @param measure: 39 @param measure:
40 @param features: 40 @param features:
41 @param lst: 41 @param lst:
42 @param truelabels: 42 @param truelabels:
43 @param model: 43 @param model:
44 @param modeltype: 44 @param modeltype:
45 @return: 45 @return:
46 """ 46 """
47 module = CLUSTERING_METHODS[modeltype] 47 module = CLUSTERING_METHODS[modeltype]
48 module.load(model) 48 module.load(model)
49 49
50 eval = {} 50 eval = {}
51 for ms in measure: 51 for ms in measure:
52 evaluation = EVALUATION_METHODS[ms] 52 evaluation = EVALUATION_METHODS[ms]
53 feats_dict = read_features(features) 53 feats_dict = read_features(features)
54 labels_dict = read_labels(truelabels) 54 labels_dict = read_labels(truelabels)
55 lst_dict = read_lst(lst) 55 lst_dict = read_lst(lst)
56 lst_keys = [key for key in lst_dict] 56 lst_keys = [key for key in lst_dict]
57 feats = np.asarray([feats_dict[key] for key in lst_keys]) 57 feats = np.asarray([feats_dict[key] for key in lst_keys])
58 Y_pred = module.predict(feats) 58 Y_pred = module.predict(feats)
59 Y_truth = [labels_dict[key][0] for key in lst_keys] 59 Y_truth = [labels_dict[key][0] for key in lst_keys]
60 60
61 le = LabelEncoder() 61 le = LabelEncoder()
62 le.fit(Y_truth) 62 le.fit(Y_truth)
63 Y_truth = le.transform(Y_truth) 63 Y_truth = le.transform(Y_truth)
64 64
65 eval[ms] = evaluation(Y_truth, Y_pred) 65 eval[ms] = evaluation(Y_truth, Y_pred)
66 66
67 print(json.dumps(eval)) 67 print(json.dumps(eval))
68 68
69 69
70 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False): 70 def kmeans_run(features: str,
71 lst: str,
72 k:int,
73 kmax: int,
74 klist,
75 maxiter: int,
76 ninit: int,
77 output: str,
78 tol: float,
79 debug: bool = False,
80 mahalanobis: str = False):
71 """ 81 """
72 82
73 @param features: output features 83 @param features: output features
74 @param lst: list file 84 @param lst: list file
75 @param k: k (kmin if kmax specified) 85 @param k: k (kmin if kmax specified)
76 @param kmax: maximum k to compute 86 @param kmax: maximum k to compute
77 @param klist: list of k values to compute, ignore k value 87 @param klist: list of k values to compute, ignore k value
78 @param output: output file if kmax not specified, else, output directory 88 @param output: output file if kmax not specified, else, output directory
79 @param mahalanobis: distance option of k-means. 89 @param mahalanobis: distance option of k-means.
80 """ 90 """
81 # -- READ FILES -- 91 # -- READ FILES --
82 features_dict = read_features(features) 92 features_dict = read_features(features)
83 lst_dict = read_lst(lst) 93 lst_dict = read_lst(lst)
84 X = np.asarray([features_dict[x] for x in lst_dict]) 94 X = np.asarray([features_dict[x] for x in lst_dict])
85 95
86 # Exception cases 96 # Exception cases
87 if kmax is None and klist is None and path.isdir(output): 97 if kmax is None and klist is None and path.isdir(output):
88 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") 98 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
89 99
90 if (kmax is not None or klist is not None) and path.isfile(output): 100 if (kmax is not None or klist is not None) and path.isfile(output):
91 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") 101 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
92 102
93 # Mono value case 103 # Mono value case
94 if kmax is None and klist is None: 104 if kmax is None and klist is None:
95 print(f"Computing clustering with k={k}") 105 if debug:
106 print(f"Computing clustering with k={k}")
96 model = CLUSTERING_METHODS["k-means"] 107 model = CLUSTERING_METHODS["k-means"]
97 if mahalanobis: 108 if mahalanobis:
98 print("Computing with mahalanobis distance")
99 model = CLUSTERING_METHODS["k-means-mahalanobis"] 109 model = CLUSTERING_METHODS["k-means-mahalanobis"]
100 model.fit(X, k) 110 model.fit(X, k, tol, maxiter, debug)
101 model.save(output) 111 model.save(output)
102 112
103 # Multi values case with kmax 113 # Multi values case with kmax
104 if kmax is not None: 114 if kmax is not None:
105 if not path.isdir(output): 115 if not path.isdir(output):
106 mkdir(output) 116 mkdir(output)
107 Ks = range(k, kmax + 1) 117 Ks = range(k, kmax + 1)
108 for i in Ks: 118 for i in Ks:
109 model = CLUSTERING_METHODS["k-means"] 119 model = CLUSTERING_METHODS["k-means"]
110 if mahalanobis: 120 if mahalanobis:
111 model = CLUSTERING_METHODS["k-means-mahalanobis"] 121 model = CLUSTERING_METHODS["k-means-mahalanobis"]
112 model.fit(X, i) 122 model.fit(X, i, tol, maxiter, debug)
113 model.save(path.join(output, "clustering_" + str(i) + ".pkl")) 123 model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
114 124
115 # Second multi values case with klist 125 # Second multi values case with klist
116 if klist is not None: 126 if klist is not None:
117 if not path.isdir(output): 127 if not path.isdir(output):
118 mkdir(output) 128 mkdir(output)
119 for k in klist: 129 for k in klist:
120 k = int(k) 130 k = int(k)
121 model = CLUSTERING_METHODS["k-means"] 131 model = CLUSTERING_METHODS["k-means"]
122 if mahalanobis: 132 if mahalanobis:
123 print("Computing with mahalanobis distance")
124 model = CLUSTERING_METHODS["k-means-mahalanobis"] 133 model = CLUSTERING_METHODS["k-means-mahalanobis"]
125 model.fit(X, k) 134 model.fit(X, k, tol, maxiter, debug)
126 model.save(path.join(output, "clustering_" + str(k) + ".pkl")) 135 model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
127 136
137 # TODO: Output JSON describing the final state of the run, e.g. the number of iterations and whether the tolerance was reached and stopped the process,
138 # etc. (which distance was used, which parameters, etc.)
139 # TODO: Move example data into a directory.
140 # TODO: Add example receipts
141 # TODO: n_init has to be taken into account in the Mahalanobis case of the k-means algorithm.
128 142
143
129 if __name__ == "__main__": 144 if __name__ == "__main__":
130 # Main parser 145 # Main parser
131 parser = argparse.ArgumentParser(description="Clustering methods to apply") 146 parser = argparse.ArgumentParser(description="Clustering methods to apply")
132 subparsers = parser.add_subparsers(title="action") 147 subparsers = parser.add_subparsers(title="action")
133 148
134 # kmeans 149 # kmeans
135 parser_kmeans = subparsers.add_parser( 150 parser_kmeans = subparsers.add_parser(
136 "kmeans", help="Compute clustering using k-means algorithm") 151 "kmeans", help="Compute clustering using k-means algorithm")
137 152
138 parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") 153 parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
139 parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") 154 parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
140 parser_kmeans.add_argument("-k", default=2, type=int, 155 parser_kmeans.add_argument("-k", default=2, type=int,
141 help="number of clusters to compute. It is kmin if kmax is specified.") 156 help="number of clusters to compute. It is kmin if kmax is specified.")
142 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") 157 parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
143 parser_kmeans.add_argument("--klist", nargs="+", 158 parser_kmeans.add_argument("--klist", nargs="+",
144 help="List of k values to test. As kmax, activate the multi values mod.") 159 help="List of k values to test. As kmax, activate the multi values mod.")
160 parser_kmeans.add_argument("--maxiter",
161 type=int,
162 default=300,
163 help="Max number of iteration before stoping if not converging")
164 parser_kmeans.add_argument("--ninit",
165 type=int,
166 default=10,
167 help="Number of time the k-means algorithm will be run with different centroid seeds.")
168 parser_kmeans.add_argument("--tol",
169 type=float,
170 default=0.0001,
171 help="Tolerance to finish of distance between centroids and their updates.")
172 parser_kmeans.add_argument("--debug", action="store_true")
145 parser_kmeans.add_argument("--output", 173 parser_kmeans.add_argument("--output",
146 default=".kmeans", 174 default=".kmeans",
147 help="output file if only k. Output directory if multiple kmax specified.") 175 help="output file if only k. Output directory if multiple kmax specified.")
148 parser_kmeans.add_argument("--mahalanobis", action="store_true") 176 parser_kmeans.add_argument("--mahalanobis", action="store_true")
149 parser_kmeans.set_defaults(which="kmeans") 177 parser_kmeans.set_defaults(which="kmeans")
150 178
151 # measure 179 # measure
152 parser_measure = subparsers.add_parser( 180 parser_measure = subparsers.add_parser(
153 "measure", help="compute the entropy") 181 "measure", help="compute the entropy")
154 182
155 parser_measure.add_argument("--measure", 183 parser_measure.add_argument("--measure",
156 required=True, 184 required=True,
157 nargs="+", 185 nargs="+",
158 choices=[key for key in EVALUATION_METHODS], 186 choices=[key for key in EVALUATION_METHODS],
159 help="...") 187 help="...")
160 parser_measure.add_argument("--features", required=True, type=str, help="...") 188 parser_measure.add_argument("--features", required=True, type=str, help="...")
161 parser_measure.add_argument("--lst", required=True, type=str, help="...") 189 parser_measure.add_argument("--lst", required=True, type=str, help="...")
162 parser_measure.add_argument("--truelabels", required=True, type=str, help="...") 190 parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
163 parser_measure.add_argument("--model", required=True, type=str, help="...") 191 parser_measure.add_argument("--model", required=True, type=str, help="...")
164 parser_measure.add_argument("--modeltype", 192 parser_measure.add_argument("--modeltype",
165 required=True, 193 required=True,
166 choices=[key for key in CLUSTERING_METHODS], 194 choices=[key for key in CLUSTERING_METHODS],
167 help="type of model for learning") 195 help="type of model for learning")
168 parser_measure.set_defaults(which="measure") 196 parser_measure.set_defaults(which="measure")
169 197
170 # disequilibrium 198 # disequilibrium
171 parser_disequilibrium = subparsers.add_parser( 199 parser_disequilibrium = subparsers.add_parser(
172 "disequilibrium", help="...") 200 "disequilibrium", help="...")
173 201
174 parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") 202 parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
175 parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") 203 parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
176 parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") 204 parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
177 parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") 205 parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
178 parser_disequilibrium.add_argument("--model-type", 206 parser_disequilibrium.add_argument("--model-type",
179 required=True, 207 required=True,
180 choices=["kmeans", "2", "3"], 208 choices=["kmeans", "2", "3"],
181 help="...") 209 help="...")
182 parser_disequilibrium.set_defaults(which="disequilibrium") 210 parser_disequilibrium.set_defaults(which="disequilibrium")
183 211
184 # Parse 212 # Parse
185 args = parser.parse_args() 213 args = parser.parse_args()
186 214
187 # Run commands 215 # Run commands
188 runner = SubCommandRunner({ 216 runner = SubCommandRunner({
189 "kmeans": kmeans_run, 217 "kmeans": kmeans_run,
190 "measure": measure_run, 218 "measure": measure_run,
191 "disequilibrium": disequilibrium_run 219 "disequilibrium": disequilibrium_run
192 }) 220 })
193 221
volia/clustering_modules/kmeans.py
1 1
2 from sklearn.cluster import KMeans 2 from sklearn.cluster import KMeans
3 import pickle 3 import pickle
4 from abstract_clustering import AbstractClustering 4 from abstract_clustering import AbstractClustering
5 5
6 class kmeans(): 6 class kmeans():
7 def __init__(self): 7 def __init__(self):
8 self.kmeans_model = None 8 self.kmeans_model = None
9 9
10 def predict(self, features): 10 def predict(self, features):
11 """ 11 """
12 12
13 @param features: 13 @param features:
14 @return: 14 @return:
15 """ 15 """
16 return self.kmeans_model.predict(features) 16 return self.kmeans_model.predict(features)
17 17
18 def load(self, model_path: str): 18 def load(self, model_path: str):
19 """ 19 """
20 20
21 @param model_path: 21 @param model_path:
22 @return: 22 @return:
23 """ 23 """
24 with open(model_path, "rb") as f: 24 with open(model_path, "rb") as f:
25 self.kmeans_model = pickle.load(f) 25 self.kmeans_model = pickle.load(f)
26 26
27 def save(self, model_path: str): 27 def save(self, model_path: str):
28 """ 28 """
29 29
30 @param model_path: 30 @param model_path:
31 @return: 31 @return:
32 """ 32 """
33 with open(model_path, "wb") as f: 33 with open(model_path, "wb") as f:
34 pickle.dump(self.kmeans_model, f) 34 pickle.dump(self.kmeans_model, f)
35 35
36 def fit(self, features, k: int): 36 def fit(self, features, k: int, tol: float, maxiter: int=300, debug: bool=False):
37 """ 37 """
38 38
39 @param features: 39 @param features:
40 @param k: 40 @param k:
41 @return: 41 @return:
42 """ 42 """
43 self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) 43 self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features)
44 44
volia/clustering_modules/kmeans_mahalanobis.py
1 1
2 2
3 from sklearn.cluster import KMeans 3 from sklearn.cluster import KMeans
4 import pickle 4 import pickle
5 import numpy as np 5 import numpy as np
6 import matplotlib.pyplot as plt 6 import matplotlib.pyplot as plt
7 from sklearn.manifold import TSNE 7 from sklearn.manifold import TSNE
8 from abstract_clustering import AbstractClustering 8 from abstract_clustering import AbstractClustering
9 9
10 class kmeansMahalanobis(): 10 class kmeansMahalanobis():
11 def __init__(self): 11 def __init__(self):
12 """ 12 """
13 13
14 """ 14 """
15 self.C = None 15 self.C = None
16 self.L = None 16 self.L = None
17 self.K = None 17 self.K = None
18 18
19 def predict(self, features): 19 def predict(self, features):
20 """ 20 """
21 21
22 @param features: 22 @param features:
23 @return: 23 @return:
24 """ 24 """
25 N = features.shape[0] 25 N = features.shape[0]
26 distances = np.zeros((N, self.K)) 26 distances = np.zeros((N, self.K))
27 for n in range(N): 27 for n in range(N):
28 for k in range(self.K): 28 for k in range(self.K):
29 distances[n][k] = self._dist(features[n], self.C[k], self.L[k]) 29 distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
30 closest_cluster = np.argmin(distances, axis=1) 30 closest_cluster = np.argmin(distances, axis=1)
31 return closest_cluster 31 return closest_cluster
32 32
33 def load(self, model_path): 33 def load(self, model_path):
34 """ 34 """
35 35
36 @param model_path: 36 @param model_path:
37 @return: 37 @return:
38 """ 38 """
39 data = None 39 data = None
40 with open(model_path): 40 with open(model_path, "rb") as f:
41 data = pickle.load() 41 data = pickle.load(f)
42 if data is None: 42 if data is None:
43 raise Exception("Le modèle n'a pas pu être chargé") 43 raise Exception("Le modèle n'a pas pu être chargé")
44 else: 44 else:
45 self.C = data["C"] 45 self.C = data["C"]
46 self.L = data["L"] 46 self.L = data["L"]
47 self.K = data["K"] 47 self.K = data["K"]
48 48
49 def save(self, modelpath: str): 49 def save(self, modelpath: str):
50 """ 50 """
51 51
52 @param modelpath: 52 @param modelpath:
53 @return: 53 @return:
54 """ 54 """
55 data = { 55 data = {
56 "C": self.C, 56 "C": self.C,
57 "L": self.L, 57 "L": self.L,
58 "K": self.K 58 "K": self.K
59 } 59 }
60 with open(modelpath, "wb") as f: 60 with open(modelpath, "wb") as f:
61 pickle.dump(data, f) 61 pickle.dump(data, f)
62 62
63 def fit(self, features, K: int): 63 def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
64 self._train(features, K) 64 self._train(features, k, tol, maxiter, debug)
65 65
66 def _initialize_model(self, X, number_clusters): 66 def _initialize_model(self, X, number_clusters):
67 d = X.shape[1] 67 d = X.shape[1]
68 C = X[np.random.choice(X.shape[0], number_clusters)] 68 C = X[np.random.choice(X.shape[0], number_clusters)]
69 L = np.zeros((number_clusters, d, d)) 69 L = np.zeros((number_clusters, d, d))
70 for k in range(number_clusters): 70 for k in range(number_clusters):
71 L[k] = np.identity(d) 71 L[k] = np.identity(d)
72 return C, L 72 return C, L
73 73
74 def _dist(self, a, b, l): 74 def _dist(self, a, b, l):
75 ''' 75 '''
76 Distance euclidienne 76 Distance euclidienne
77 ''' 77 '''
78 a = np.reshape(a, (-1, 1)) 78 a = np.reshape(a, (-1, 1))
79 b = np.reshape(b, (-1, 1)) 79 b = np.reshape(b, (-1, 1))
80 result = np.transpose(a - b).dot(l).dot(a-b)[0][0] 80 result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
81 return result 81 return result
82 82
83 def _plot_iteration(self, iteration, points, clusters, centers): 83 def _plot_iteration(self, iteration, points, clusters, centers):
84 fig = plt.figure() 84 fig = plt.figure()
85 ax = fig.add_subplot(111) 85 ax = fig.add_subplot(111)
86 scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) 86 scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
87 87
88 #for center in centers: 88 #for center in centers:
89 # ax.scatter(center[0], center[1], s=50, c='red', marker='+') 89 # ax.scatter(center[0], center[1], s=50, c='red', marker='+')
90 ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+') 90 ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
91 91
92 ax.set_xlabel('x') 92 ax.set_xlabel('x')
93 ax.set_ylabel('y') 93 ax.set_ylabel('y')
94 plt.colorbar(scatter) 94 plt.colorbar(scatter)
95 #plt.ylim(0, 1) 95 #plt.ylim(0, 1)
96 #plt.xlim(0, 1) 96 #plt.xlim(0, 1)
97 plt.savefig("test_" + str(iteration) + ".pdf") 97 plt.savefig("test_" + str(iteration) + ".pdf")
98 98
99 def _train(self, features, K: int): 99 def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False):
100 X = features 100 X = features
101 N = X.shape[0] 101 N = X.shape[0]
102 d = X.shape[1] 102 d = X.shape[1]
103 103
104 X_embedded = None
104 C, L = self._initialize_model(X, K) 105 C, L = self._initialize_model(X, K)
105 self.C = C 106 self.C = C
106 self.L = L 107 self.L = L
107 self.K = K 108 self.K = K
108 109
109 end_algo = False 110 end_algo = False
110 i = 0 111 i = 0
111 while not end_algo: 112 while not end_algo:
112 if i == 10: 113 if debug:
113 exit(1) 114 print("Iteration: ", i)
114 print("Iteration: ", i) 115
115 # Calcul matrix distance 116 # Calcul matrix distance
116 distances = np.zeros((N, K)) 117 distances = np.zeros((N, K))
117 118
118 for n in range(N): 119 for n in range(N):
119 for k in range(self.K): 120 for k in range(self.K):
120 distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) 121 distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
121 closest_cluster = np.argmin(distances, axis=1) 122 closest_cluster = np.argmin(distances, axis=1)
122 if i % 1 == 0: 123
123 # -- Debug tool ---------------------- 124 # -- Debug tool ----------------------
124 # TSNE 125 if debug and i % 10 == 0:
125 #X_embedded = np.concatenate((X, self.C), axis=0) 126 # TSNE if needed
126 X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) 127 X_embedded = np.concatenate((X, self.C), axis=0)
128 if d > 2:
129 X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
130
127 # Then plot 131 # Then plot
128 self._plot_iteration( 132 self._plot_iteration(
129 i, 133 i,
130 X_embedded[:X.shape[0]], 134 X_embedded[:X.shape[0]],
131 closest_cluster, 135 closest_cluster,
132 X_embedded[X.shape[0]:] 136 X_embedded[X.shape[0]:]
133 ) 137 )
134 # ------------------------------------ 138 # ------------------------------------
135 139
136 end_algo = True 140 old_c = self.C.copy()
137 for k in range(K): 141 for k in range(K):
138 # Find subset of X with values closed to the centroid c_k. 142 # Find subset of X with values closed to the centroid c_k.
139 X_sub = np.where(closest_cluster == k) 143 X_sub = np.where(closest_cluster == k)
140 X_sub = np.take(X, X_sub[0], axis=0) 144 X_sub = np.take(X, X_sub[0], axis=0)
141 if X_sub.shape[0] == 0: 145 if X_sub.shape[0] == 0:
142 continue 146 continue
143 np.mean(X_sub, axis=0) 147 np.mean(X_sub, axis=0)
144 C_new = np.mean(X_sub, axis=0) 148 C_new = np.mean(X_sub, axis=0)
145 149
146 # -- COMPUTE NEW LAMBDA (here named K) -- 150 # -- COMPUTE NEW LAMBDA (here named K) --
147 K_new = np.zeros((L.shape[1], L.shape[2])) 151 K_new = np.zeros((L.shape[1], L.shape[2]))
148 for x in X_sub: 152 for x in X_sub:
149 x = np.reshape(x, (-1, 1)) 153 x = np.reshape(x, (-1, 1))
150 c_tmp = np.reshape(C_new, (-1, 1)) 154 c_tmp = np.reshape(C_new, (-1, 1))
151 K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) 155 K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
152 K_new = K_new / X_sub.shape[0] 156 K_new = K_new / X_sub.shape[0]
153 K_new = np.linalg.pinv(K_new) 157 K_new = np.linalg.pinv(K_new)
154 158
155 if end_algo and (not (self.C[k] == C_new).all()): # If the same stop 159 #if end_algo and (not (self.C[k] == C_new).all()): # If the same stop
156 end_algo = False 160 # end_algo = False
157 self.C[k] = C_new 161 self.C[k] = C_new
158 self.L[k] = K_new 162 self.L[k] = K_new
163
164 diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
165 if diff > tol:
166 end_algo = False
167 if debug:
168 print(f"{diff}")
169 elif debug:
170 print(f"Tolerance threshold {tol} reached with diff {diff}")
171 end_algo = True
159 i = i + 1 172 i = i + 1
173 if i > maxiter:
174 end_algo = True
175 if debug:
176 print(f"Iteration {maxiter} reached")
160 177