Commit 4309b4a340144c6dff7757892cd5539e89e20538

Authored by quillotm
1 parent 88d1d67e9d
Exists in master

Add constrained Mahalanobis distance to help convergence

Showing 2 changed files with 23 additions and 12 deletions
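The "constrained" variant normalizes each cluster covariance to unit determinant before inverting it. Without this constraint, Mahalanobis k-means can lower its loss simply by inflating a cluster's covariance, which shrinks every distance measured with that cluster's precision matrix instead of improving the partition; that is presumably the convergence problem the commit message refers to. As a sketch in the commit's notation, with \Sigma_k the empirical covariance of cluster k, d the feature dimension, and \Lambda_k the precision matrix stored in L[k]:

\[
\tilde{\Sigma}_k = \frac{\Sigma_k}{\det(\Sigma_k)^{1/d}}, \qquad
\det(\tilde{\Sigma}_k) = 1, \qquad
\Lambda_k = \tilde{\Sigma}_k^{+}, \qquad
d_{\Lambda_k}(x, c_k) = (x - c_k)^{\top} \Lambda_k (x - c_k)
\]

The unconstrained branch keeps \Lambda_k = \Sigma_k^{+}, exactly as before (here ^{+} denotes the Moore-Penrose pseudo-inverse computed by np.linalg.pinv).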

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

 import core.measures
 import json


 CLUSTERING_METHODS = {
     "k-means": kmeans(),
-    "k-means-mahalanobis": kmeansMahalanobis()
+    "k-means-mahalanobis": kmeansMahalanobis(),
+    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True)
 }

 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }


 def disequilibrium_run():
     pass


 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """

     @param measure:
     @param features:
     @param lst:
     @param truelabels:
     @param model:
     @param modeltype:
     @return:
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     eval = {}
     for ms in measure:
         evaluation = EVALUATION_METHODS[ms]
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_dict = read_lst(lst)
         lst_keys = [key for key in lst_dict]
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         Y_truth = [labels_dict[key][0] for key in lst_keys]

         le = LabelEncoder()
         le.fit(Y_truth)
         Y_truth = le.transform(Y_truth)

         eval[ms] = evaluation(Y_truth, Y_pred)

     print(json.dumps(eval))


 def kmeans_run(features: str,
                lst: str,
                k: int,
                kmax: int,
                klist,
                maxiter: int,
                ninit: int,
                output: str,
                tol: float,
                debug: bool = False,
                mahalanobis: bool = False):
     """

     @param features: features file
     @param lst: list file
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute
     @param klist: list of k values to compute; the k value is ignored
     @param output: output file if kmax not specified, otherwise an output directory
     @param mahalanobis: distance option of k-means.
     """
     json_content = locals().copy()

     def fit_model(k: int, output_file):
         if debug:
             print(f"Computing clustering with k={k}")
         model = CLUSTERING_METHODS["k-means"]
         if mahalanobis:
             if debug:
                 print("Mahalanobis activated")
             model = CLUSTERING_METHODS["k-means-mahalanobis"]
         model.fit(X, k, tol, ninit, maxiter, debug)
         model.save(output_file)
         json_content["models"].append({
             "model_file": output_file,
             "k": k,
         })

     json_content["models"] = []

     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])

     # Exception cases
     if kmax is None and klist is None and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system expects the path of a file.")

     if (kmax is not None or klist is not None) and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system expects the path of a directory.")

     # Single-value case
     if kmax is None and klist is None:
         fit_model(k, output)

     # Multi-value case with kmax
     if kmax is not None:
         if not path.isdir(output):
             mkdir(output)
         Ks = range(k, kmax + 1)
         for i in Ks:
             fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))

     # Second multi-value case, with klist
     if klist is not None:
         if not path.isdir(output):
             mkdir(output)
         for k in klist:
             k = int(k)
             fit_model(k, path.join(output, "clustering_" + str(k) + ".pkl"))

     print(json_content)


 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")

     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")

     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="Number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="If specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. Like kmax, activates the multi-value mode.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Max number of iterations before stopping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of times the k-means algorithm will be run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Stopping tolerance on the distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="Output file if only k is given; output directory if kmax or klist is specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")

     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute the entropy")

     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")

     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")

     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")

     # Parse
     args = parser.parse_args()

     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })

     runner.run(args.which, args.__dict__, remove="which")
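A minimal sketch of exercising the new registry entry directly (toy data; the constructor and fit signature are the ones defined in kmeans_mahalanobis.py below):

    import numpy as np
    from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

    X = np.random.randn(200, 4)                  # toy (N, d) feature matrix
    model = kmeansMahalanobis(constrained=True)  # the variant added by this commit
    model.fit(X, k=3, tol=1e-4, ninit=5, maxiter=100, debug=False)
    labels = model.predict(X)                    # cluster index per sample
    model.save("constrained_kmeans.pkl")         # hypothetical output path

Note that the kmeans subcommand's --mahalanobis flag still maps to the unconstrained entry; as of this commit, "k-means-mahalanobis-constrained" is only selectable through --modeltype on the measure subcommand.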
volia/clustering_modules/kmeans_mahalanobis.py
 from sklearn.cluster import KMeans
 import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from abstract_clustering import AbstractClustering


 class kmeansMahalanobis():
-    def __init__(self):
+    def __init__(self, constrained: bool = False):
         """

         """
         self.C = None
         self.L = None
         self.K = None
+        self.constrained = constrained

     def predict(self, features):
         """

         @param features:
         @return:
         """
         N = features.shape[0]
         distances = np.zeros((N, self.K))
         for n in range(N):
             for k in range(self.K):
                 distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
         closest_cluster = np.argmin(distances, axis=1)
         return closest_cluster

     def load(self, model_path):
         """

         @param model_path:
         @return:
         """
         data = None
         with open(model_path, "rb") as f:
             data = pickle.load(f)
         if data is None:
             raise Exception("The model could not be loaded")
         else:
             self.C = data["C"]
             self.L = data["L"]
             self.K = data["K"]
+            self.constrained = data["constrained"]

     def save(self, modelpath: str):
         """

         @param modelpath:
         @return:
         """
         data = {
             "C": self.C,
             "L": self.L,
-            "K": self.K
+            "K": self.K,
+            "constrained": self.constrained
         }
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)

     def fit(self, features, k: int, tol: float, ninit: int, maxiter: int = 300, debug: bool = False):
         results = []
         for i in range(ninit):
             results.append(self._train(features, k, tol, maxiter, debug))
         losses = [v["loss"] for v in results]
         best = results[losses.index(min(losses))]
         if debug:
             print(f"best: {best['loss']} loss")
         self.C = best["C"]
         self.L = best["L"]
         self.K = best["K"]

     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
         C = X[np.random.choice(X.shape[0], number_clusters)]
         L = np.zeros((number_clusters, d, d))
         for k in range(number_clusters):
             L[k] = np.identity(d)
         return C, L

     def _dist(self, a, b, l):
         '''
-        Euclidean distance
+        Mahalanobis distance
         '''
         a = np.reshape(a, (-1, 1))
         b = np.reshape(b, (-1, 1))
-        result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
+        result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
         return result

     def _plot_iteration(self, iteration, points, clusters, centers):
         fig = plt.figure()
         ax = fig.add_subplot(111)
         scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)

         #for center in centers:
         #    ax.scatter(center[0], center[1], s=50, c='red', marker='+')
         ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')

         ax.set_xlabel('x')
         ax.set_ylabel('y')
         plt.colorbar(scatter)
         #plt.ylim(0, 1)
         #plt.xlim(0, 1)
         plt.savefig("test_" + str(iteration) + ".pdf")

     def _train(self, features, K: int, tol: float, maxiter: int, debug: bool = False):
         X = features
         N = X.shape[0]
         d = X.shape[1]

         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
         self.K = K

         end_algo = False
         i = 0
         while not end_algo:
             if debug:
                 print("Iteration: ", i)

             # Compute the distance matrix
             distances = np.zeros((N, self.K))

             for n in range(N):
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])

             closest_cluster = np.argmin(distances, axis=1)

             loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
             if debug:
                 print(f"loss {loss}")

             # -- Debug tool ----------------------
-            if debug and i % 10 == 0:
+            if debug and i % 1 == 0:
                 # TSNE if needed
                 X_embedded = np.concatenate((X, self.C), axis=0)
                 if d > 2:
-                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, self.C), axis=0))

                 # Then plot
                 self._plot_iteration(
                     i,
                     X_embedded[:X.shape[0]],
                     closest_cluster,
                     X_embedded[X.shape[0]:]
                 )
             # ------------------------------------

             old_c = self.C.copy()
-            for k in range(K):
+            for k in range(self.K):
                 # Find the subset of X closest to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
                 X_sub = np.take(X, X_sub[0], axis=0)
                 if X_sub.shape[0] == 0:
                     continue
-                np.mean(X_sub, axis=0)

                 C_new = np.mean(X_sub, axis=0)

                 # -- COMPUTE NEW LAMBDA (here named K) --
-                K_new = np.zeros((L.shape[1], L.shape[2]))
+                K_new = np.zeros((self.L.shape[1], self.L.shape[2]))
+                tmp = np.zeros((self.L.shape[1], self.L.shape[2]))
                 for x in X_sub:
                     x = np.reshape(x, (-1, 1))
                     c_tmp = np.reshape(C_new, (-1, 1))
-                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
-                K_new = K_new / X_sub.shape[0]
+                    tmp = tmp + (x - c_tmp).dot((x - c_tmp).transpose())
+                if self.constrained:
+                    K_new = (tmp / X_sub.shape[0]) / np.power(np.linalg.det(tmp / X_sub.shape[0]), 1 / d)
+                else:
+                    K_new = tmp / X_sub.shape[0]
                 K_new = np.linalg.pinv(K_new)

                 #if end_algo and (not (self.C[k] == C_new).all()):  # If the same, stop
                 #    end_algo = False
                 self.C[k] = C_new
                 self.L[k] = K_new

             diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
             if diff > tol:
                 end_algo = False
                 if debug:
                     print(f"{diff}")
             else:
                 if debug:
                     print(f"Tolerance threshold {tol} reached with diff {diff}")
                 end_algo = True

             i = i + 1
             if i > maxiter:
                 end_algo = True
                 if debug:
                     print(f"Iteration {maxiter} reached")
         return {
             "loss": loss,
             "C": self.C,
             "K": self.K,
             "L": self.L
         }
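predict and _train compute distances with nested Python loops over the N samples and K clusters. For anyone profiling convergence on larger feature sets, a vectorized equivalent of those loops (a sketch under the same array shapes, not part of the commit):

    import numpy as np

    def mahalanobis_distances(X, C, L):
        # X: (N, d) features, C: (K, d) centroids, L: (K, d, d) precision matrices.
        # Returns the (N, K) matrix of squared Mahalanobis distances,
        # equivalent to calling _dist on every (n, k) pair.
        diff = X[:, None, :] - C[None, :, :]  # (N, K, d)
        return np.einsum("nkd,kde,nke->nk", diff, L, diff)

    # Usage: closest_cluster = np.argmin(mahalanobis_distances(X, C, L), axis=1)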