Commit 4309b4a340144c6dff7757892cd5539e89e20538
Parent: 88d1d67e9d
Exists in: master

Adding constrained Mahalanobis to help convergence

2 changed files with 23 additions and 12 deletions
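The constraint introduced here acts on each per-cluster covariance estimate before it is inverted into the precision matrix used by the distance. A plausible reading of the commit message: normalizing every covariance to unit determinant keeps a cluster from shrinking its metric arbitrarily, so the per-cluster losses stay on a comparable scale during the updates. In the notation of kmeans_mahalanobis.py below (d is the feature dimension):

\[
\hat{\Sigma}_k = \frac{\Sigma_k}{\det(\Sigma_k)^{1/d}},
\qquad \Lambda_k = \hat{\Sigma}_k^{-1},
\qquad \det(\hat{\Sigma}_k) = 1
\]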
volia/clustering.py
 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

 import core.measures
 import json


 CLUSTERING_METHODS = {
     "k-means": kmeans(),
-    "k-means-mahalanobis": kmeansMahalanobis()
+    "k-means-mahalanobis": kmeansMahalanobis(),
+    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True)
 }

 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
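A minimal sketch of driving the new registry entry directly from Python; the feature matrix is synthetic and the output path is a placeholder (in the CLI below, X comes from read_features/read_lst):

import numpy as np

X = np.random.rand(100, 8)  # stand-in for a (N, d) feature matrix

model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
model.fit(X, 3, 0.0001, 5, 100, False)  # k, tol, ninit, maxiter, debug
model.save("constrained_k3.pkl")        # placeholder path
labels = model.predict(X)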


 def disequilibrium_run():
     pass


 def measure_run(measure, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
     Evaluate a trained clustering model against reference labels.

     @param measure: list of evaluation measure names (keys of EVALUATION_METHODS)
     @param features: features file
     @param lst: list file (.lst)
     @param truelabels: reference labels file
     @param model: path of the trained model
     @param modeltype: model type (key of CLUSTERING_METHODS)
     @return: None; prints the scores as JSON
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     scores = {}
     for ms in measure:
         evaluation = EVALUATION_METHODS[ms]
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_dict = read_lst(lst)
         lst_keys = [key for key in lst_dict]
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         Y_truth = [labels_dict[key][0] for key in lst_keys]

         le = LabelEncoder()
         le.fit(Y_truth)
         Y_truth = le.transform(Y_truth)

         scores[ms] = evaluation(Y_truth, Y_pred)

     print(json.dumps(scores))
69 | 70 | ||
70 | def kmeans_run(features: str, | 71 | def kmeans_run(features: str, |
71 | lst: str, | 72 | lst: str, |
72 | k:int, | 73 | k:int, |
73 | kmax: int, | 74 | kmax: int, |
74 | klist, | 75 | klist, |
75 | maxiter: int, | 76 | maxiter: int, |
76 | ninit: int, | 77 | ninit: int, |
77 | output: str, | 78 | output: str, |
78 | tol: float, | 79 | tol: float, |
79 | debug: bool = False, | 80 | debug: bool = False, |
80 | mahalanobis: str = False): | 81 | mahalanobis: str = False): |
81 | """ | 82 | """ |
82 | 83 | ||
83 | @param features: output features | 84 | @param features: output features |
84 | @param lst: list file | 85 | @param lst: list file |
85 | @param k: k (kmin if kmax specified) | 86 | @param k: k (kmin if kmax specified) |
86 | @param kmax: maximum k to compute | 87 | @param kmax: maximum k to compute |
87 | @param klist: list of k values to compute, ignore k value | 88 | @param klist: list of k values to compute, ignore k value |
88 | @param output: output file if kmax not specified, else, output directory | 89 | @param output: output file if kmax not specified, else, output directory |
89 | @param mahalanobis: distance option of k-means. | 90 | @param mahalanobis: distance option of k-means. |
90 | """ | 91 | """ |
91 | json_content = locals().copy() | 92 | json_content = locals().copy() |
92 | 93 | ||
93 | def fit_model(k: int, output_file): | 94 | def fit_model(k: int, output_file): |
94 | if debug: | 95 | if debug: |
95 | print(f"Computing clustering with k={k}") | 96 | print(f"Computing clustering with k={k}") |
96 | model = CLUSTERING_METHODS["k-means"] | 97 | model = CLUSTERING_METHODS["k-means"] |
97 | if mahalanobis: | 98 | if mahalanobis: |
98 | if debug: | 99 | if debug: |
99 | print("Mahalanobis activated") | 100 | print("Mahalanobis activated") |
100 | model = CLUSTERING_METHODS["k-means-mahalanobis"] | 101 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
101 | model.fit(X, k, tol, ninit, maxiter, debug) | 102 | model.fit(X, k, tol, ninit, maxiter, debug) |
102 | model.save(output_file) | 103 | model.save(output_file) |
103 | json_content["models"].append({ | 104 | json_content["models"].append({ |
104 | "model_file": output_file, | 105 | "model_file": output_file, |
105 | "k": k, | 106 | "k": k, |
106 | }) | 107 | }) |
107 | 108 | ||
108 | json_content["models"] = [] | 109 | json_content["models"] = [] |
109 | 110 | ||
110 | # -- READ FILES -- | 111 | # -- READ FILES -- |
111 | features_dict = read_features(features) | 112 | features_dict = read_features(features) |
112 | lst_dict = read_lst(lst) | 113 | lst_dict = read_lst(lst) |
113 | X = np.asarray([features_dict[x] for x in lst_dict]) | 114 | X = np.asarray([features_dict[x] for x in lst_dict]) |
114 | 115 | ||
115 | # Exception cases | 116 | # Exception cases |
116 | if kmax is None and klist is None and path.isdir(output): | 117 | if kmax is None and klist is None and path.isdir(output): |
117 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") | 118 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") |
118 | 119 | ||
119 | if (kmax is not None or klist is not None) and path.isfile(output): | 120 | if (kmax is not None or klist is not None) and path.isfile(output): |
120 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") | 121 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") |
121 | 122 | ||
122 | # Mono value case | 123 | # Mono value case |
123 | if kmax is None and klist is None: | 124 | if kmax is None and klist is None: |
124 | fit_model(k, output) | 125 | fit_model(k, output) |
125 | 126 | ||
126 | # Multi values case with kmax | 127 | # Multi values case with kmax |
127 | if kmax is not None: | 128 | if kmax is not None: |
128 | if not path.isdir(output): | 129 | if not path.isdir(output): |
129 | mkdir(output) | 130 | mkdir(output) |
130 | Ks = range(k, kmax + 1) | 131 | Ks = range(k, kmax + 1) |
131 | for i in Ks: | 132 | for i in Ks: |
132 | fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl")) | 133 | fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl")) |
133 | 134 | ||
134 | # Second multi values case with klist | 135 | # Second multi values case with klist |
135 | if klist is not None: | 136 | if klist is not None: |
136 | if not path.isdir(output): | 137 | if not path.isdir(output): |
137 | mkdir(output) | 138 | mkdir(output) |
138 | for k in klist: | 139 | for k in klist: |
139 | k = int(k) | 140 | k = int(k) |
140 | fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl")) | 141 | fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl")) |
141 | 142 | ||
142 | print(json_content) | 143 | print(json_content) |
143 | 144 | ||
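A sketch of the dispatch modes with placeholder paths; a single k writes one model file, while kmax (or klist) writes one clustering_<k>.pkl per value into the output directory:

# Single model: output is a file path.
kmeans_run("feats.txt", "train.lst", k=4, kmax=None, klist=None,
           maxiter=300, ninit=10, output="k4.pkl", tol=1e-4)

# Range of k with the Mahalanobis variant: output is a directory.
kmeans_run("feats.txt", "train.lst", k=2, kmax=10, klist=None,
           maxiter=300, ninit=10, output="models/", tol=1e-4,
           mahalanobis=True)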


 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")

     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using the k-means algorithm")

     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="Number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="If specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. Like kmax, activates the multi-value mode.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Maximum number of iterations before stopping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of times the k-means algorithm is run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Convergence tolerance on the distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="Output file if only k is given; output directory if kmax or klist is specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")

     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="Compute evaluation measures")

     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="Evaluation measures to compute")
     parser_measure.add_argument("--features", required=True, type=str, help="Features file")
     parser_measure.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="Reference labels file")
     parser_measure.add_argument("--model", required=True, type=str, help="Model file to load")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")

     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")

     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")

     # Parse
     args = parser.parse_args()

     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })

     runner.run(args.which, args.__dict__, remove="which")
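For reference, a kmeans invocation built only from the flags defined in this parser (paths are placeholders):

args = parser.parse_args(["kmeans", "--features", "feats.txt", "--lst", "train.lst",
                          "-k", "2", "--kmax", "10", "--mahalanobis",
                          "--output", "models/"])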
volia/clustering_modules/kmeans_mahalanobis.py


 from sklearn.cluster import KMeans
 import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from abstract_clustering import AbstractClustering

 class kmeansMahalanobis():
-    def __init__(self):
+    def __init__(self, constrained: bool = False):
         """
         @param constrained: if True, normalize each cluster covariance to
         unit determinant before inverting it (constrained variant).
         """
         self.C = None
         self.L = None
         self.K = None
+        self.constrained = constrained
     def predict(self, features):
         """
         @param features: (N, d) array of feature vectors
         @return: index of the closest cluster for each feature vector
         """
         N = features.shape[0]
         distances = np.zeros((N, self.K))
         for n in range(N):
             for k in range(self.K):
                 distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
         closest_cluster = np.argmin(distances, axis=1)
         return closest_cluster
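predict loops in Python over all N x K pairs; an equivalent vectorized form (a sketch, assuming C has shape (K, d) and L has shape (K, d, d)) computes every quadratic form in one einsum:

import numpy as np

def predict_vectorized(X, C, L):
    # diff[n, k] = X[n] - C[k], shape (N, K, d)
    diff = X[:, None, :] - C[None, :, :]
    # d2[n, k] = (X[n] - C[k])^T  L[k]  (X[n] - C[k])
    d2 = np.einsum("nkd,kde,nke->nk", diff, L, diff)
    return np.argmin(d2, axis=1)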

     def load(self, model_path):
         """
         @param model_path: path of the pickled model to load
         @return: None
         """
         data = None
         with open(model_path, "rb") as f:
             data = pickle.load(f)
         if data is None:
             raise Exception("The model could not be loaded")
         else:
             self.C = data["C"]
             self.L = data["L"]
             self.K = data["K"]
+            # Models saved before this change may not have a "constrained" entry.
+            self.constrained = data.get("constrained", False)

     def save(self, modelpath: str):
         """
         @param modelpath: path where the pickled model is written
         @return: None
         """
         data = {
             "C": self.C,
             "L": self.L,
-            "K": self.K
+            "K": self.K,
+            "constrained": self.constrained
         }
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
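The pickle payload is a plain dict, so a round trip preserves the constrained flag; a small check with placeholder values and path:

m = kmeansMahalanobis(constrained=True)
m.C, m.L, m.K = np.zeros((2, 3)), np.stack([np.eye(3)] * 2), 2
m.save("tmp_model.pkl")

m2 = kmeansMahalanobis()
m2.load("tmp_model.pkl")
assert m2.constrained is True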

     def fit(self, features, k: int, tol: float, ninit: int, maxiter: int = 300, debug: bool = False):
         results = []
         for i in range(ninit):
             results.append(self._train(features, k, tol, maxiter, debug))
         losses = [v["loss"] for v in results]
         best = results[losses.index(min(losses))]
         if debug:
             print(f"best: {best['loss']} loss")
         self.C = best["C"]
         self.L = best["L"]
         self.K = best["K"]

     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
         # Draw initial centroids without replacement to avoid duplicated centers.
         C = X[np.random.choice(X.shape[0], number_clusters, replace=False)]
         L = np.zeros((number_clusters, d, d))
         for k in range(number_clusters):
             L[k] = np.identity(d)
         return C, L

     def _dist(self, a, b, l):
         '''
         Squared Mahalanobis distance between a and b with precision matrix l.
         '''
         a = np.reshape(a, (-1, 1))
         b = np.reshape(b, (-1, 1))
         result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
         return result
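_dist returns a squared distance; with the identity as precision matrix it reduces to the squared Euclidean distance:

import numpy as np

km = kmeansMahalanobis()
a, b = np.array([1.0, 2.0]), np.array([0.0, 0.0])
print(km._dist(a, b, np.eye(2)))  # 5.0 == 1**2 + 2**2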

     def _plot_iteration(self, iteration, points, clusters, centers):
         fig = plt.figure()
         ax = fig.add_subplot(111)
         scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
         ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')

         ax.set_xlabel('x')
         ax.set_ylabel('y')
         plt.colorbar(scatter)
         plt.savefig("test_" + str(iteration) + ".pdf")

     def _train(self, features, K: int, tol: float, maxiter: int, debug: bool = False):
         X = features
         N = X.shape[0]
         d = X.shape[1]

         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
         self.K = K

         end_algo = False
         i = 0
         while not end_algo:
             if debug:
                 print("Iteration: ", i)

             # Compute the distance matrix
             distances = np.zeros((N, self.K))

             for n in range(N):
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])

             closest_cluster = np.argmin(distances, axis=1)

             loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
             if debug:
                 print(f"loss {loss}")

             # -- Debug tool ----------------------
-            if debug and i % 10 == 0:
+            if debug and i % 1 == 0:  # plot every iteration instead of every tenth
                 # TSNE if needed
                 X_embedded = np.concatenate((X, self.C), axis=0)
                 if d > 2:
-                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, self.C), axis=0))

                 # Then plot
                 self._plot_iteration(
                     i,
                     X_embedded[:X.shape[0]],
                     closest_cluster,
                     X_embedded[X.shape[0]:]
                 )
             # ------------------------------------

             old_c = self.C.copy()
-            for k in range(K):
+            for k in range(self.K):
                 # Find the subset of X closest to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
                 X_sub = np.take(X, X_sub[0], axis=0)
                 if X_sub.shape[0] == 0:
                     continue

-                np.mean(X_sub, axis=0)
                 C_new = np.mean(X_sub, axis=0)

                 # -- COMPUTE NEW LAMBDA (here named K) --
-                K_new = np.zeros((L.shape[1], L.shape[2]))
+                tmp = np.zeros((self.L.shape[1], self.L.shape[2]))
                 for x in X_sub:
                     x = np.reshape(x, (-1, 1))
                     c_tmp = np.reshape(C_new, (-1, 1))
-                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
-                K_new = K_new / X_sub.shape[0]
+                    tmp = tmp + (x - c_tmp).dot((x - c_tmp).transpose())
+                cov = tmp / X_sub.shape[0]
+                if self.constrained:
+                    # Scale the covariance to unit determinant before inversion.
+                    K_new = cov / np.power(np.linalg.det(cov), 1 / d)
+                else:
+                    K_new = cov
                 K_new = np.linalg.pinv(K_new)

                 self.C[k] = C_new
                 self.L[k] = K_new

             diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
             if diff > tol:
                 end_algo = False
                 if debug:
                     print(f"{diff}")
             else:
                 if debug:
                     print(f"Tolerance threshold {tol} reached with diff {diff}")
                 end_algo = True

             i = i + 1
             if i >= maxiter:
                 end_algo = True
                 if debug:
                     print(f"Maximum number of iterations ({maxiter}) reached")
         return {
             "loss": loss,
             "C": self.C,
             "K": self.K,
             "L": self.L
         }
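A numeric check of the constrained update: dividing a covariance by det(cov)**(1/d) yields a unit-determinant matrix, so every cluster's metric encloses the same volume (names here are local to the example):

import numpy as np

rng = np.random.default_rng(0)
X_sub = rng.normal(size=(50, 4))        # stand-in for one cluster's points
cov = np.cov(X_sub, rowvar=False)
d = cov.shape[0]

cov_c = cov / np.power(np.linalg.det(cov), 1 / d)
print(np.linalg.det(cov_c))             # ~1.0 up to floating point error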