Commit 4152e83df25ef19c8b048592e9629911bcf77e1a
1 parent 3c07f672ad
Exists in master
Adding kmeans mahalanobis. The algorithm is now fully functional. Need to solve some problems with identity matrix usage and infinite or nan values.
Showing 3 changed files with 216 additions and 17 deletions
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | ||
9 | 10 | ||
10 | from sklearn.preprocessing import LabelEncoder | 11 | from sklearn.preprocessing import LabelEncoder |
11 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
12 | 13 | ||
13 | import core.measures | 14 | import core.measures |
14 | import json | 15 | import json |
15 | 16 | ||
16 | 17 | ||
17 | CLUSTERING_METHODS = { | 18 | CLUSTERING_METHODS = { |
18 | "k-means": kmeans() | 19 | "k-means": kmeans(), |
20 | "k-means-mahalanobis": kmeansMahalanobis() | ||
19 | } | 21 | } |
20 | 22 | ||
21 | EVALUATION_METHODS = { | 23 | EVALUATION_METHODS = { |
22 | "entropy": core.measures.entropy_score, | 24 | "entropy": core.measures.entropy_score, |
23 | "purity": core.measures.purity_score, | 25 | "purity": core.measures.purity_score, |
24 | "v-measure": v_measure_score, | 26 | "v-measure": v_measure_score, |
25 | "homogeneity": homogeneity_score, | 27 | "homogeneity": homogeneity_score, |
26 | "completeness": completeness_score, | 28 | "completeness": completeness_score, |
27 | } | 29 | } |
28 | 30 | ||
29 | 31 | ||
30 | def disequilibrium_run(): | 32 | def disequilibrium_run(): |
31 | pass | 33 | pass |
32 | 34 | ||
33 | 35 | ||
34 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | 36 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): |
35 | """ | 37 | """ |
36 | 38 | ||
37 | @param measure: | 39 | @param measure: |
38 | @param features: | 40 | @param features: |
39 | @param lst: | 41 | @param lst: |
40 | @param truelabels: | 42 | @param truelabels: |
41 | @param model: | 43 | @param model: |
42 | @param modeltype: | 44 | @param modeltype: |
43 | @return: | 45 | @return: |
44 | """ | 46 | """ |
45 | module = CLUSTERING_METHODS[modeltype] | 47 | module = CLUSTERING_METHODS[modeltype] |
46 | module.load(model) | 48 | module.load(model) |
47 | 49 | ||
48 | eval = {} | 50 | eval = {} |
49 | for ms in measure: | 51 | for ms in measure: |
50 | evaluation = EVALUATION_METHODS[ms] | 52 | evaluation = EVALUATION_METHODS[ms] |
51 | feats_dict = read_features(features) | 53 | feats_dict = read_features(features) |
52 | labels_dict = read_labels(truelabels) | 54 | labels_dict = read_labels(truelabels) |
53 | lst_dict = read_lst(lst) | 55 | lst_dict = read_lst(lst) |
54 | lst_keys = [key for key in lst_dict] | 56 | lst_keys = [key for key in lst_dict] |
55 | feats = np.asarray([feats_dict[key] for key in lst_keys]) | 57 | feats = np.asarray([feats_dict[key] for key in lst_keys]) |
56 | Y_pred = module.predict(feats) | 58 | Y_pred = module.predict(feats) |
57 | Y_truth = [labels_dict[key][0] for key in lst_keys] | 59 | Y_truth = [labels_dict[key][0] for key in lst_keys] |
58 | 60 | ||
59 | le = LabelEncoder() | 61 | le = LabelEncoder() |
60 | le.fit(Y_truth) | 62 | le.fit(Y_truth) |
61 | Y_truth = le.transform(Y_truth) | 63 | Y_truth = le.transform(Y_truth) |
62 | 64 | ||
63 | eval[ms] = evaluation(Y_truth, Y_pred) | 65 | eval[ms] = evaluation(Y_truth, Y_pred) |
64 | 66 | ||
65 | print(json.dumps(eval)) | 67 | print(json.dumps(eval)) |
66 | 68 | ||
67 | 69 | ||
68 | 70 | def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False): | |
69 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | ||
70 | """ | 71 | """ |
71 | 72 | ||
72 | @param features: output features | 73 | @param features: output features |
73 | @param lst: list file | 74 | @param lst: list file |
74 | @param k: k (kmin if kmax specified) | 75 | @param k: k (kmin if kmax specified) |
75 | @param kmax: maximum k to compute | 76 | @param kmax: maximum k to compute |
76 | @param klist: list of k values to compute, ignore k value | 77 | @param klist: list of k values to compute, ignore k value |
77 | @param output: output file if kmax not specified, else, output directory | 78 | @param output: output file if kmax not specified, else, output directory |
79 | @param mahalanobis: if True, use the Mahalanobis distance for k-means. | ||
78 | """ | 80 | """ |
79 | # -- READ FILES -- | 81 | # -- READ FILES -- |
80 | features_dict = read_features(features) | 82 | features_dict = read_features(features) |
81 | lst_dict = read_lst(lst) | 83 | lst_dict = read_lst(lst) |
82 | X = np.asarray([features_dict[x] for x in lst_dict]) | 84 | X = np.asarray([features_dict[x] for x in lst_dict]) |
83 | 85 | ||
84 | # Exception cases | 86 | # Exception cases |
85 | if kmax is None and klist is None and path.isdir(output): | 87 | if kmax is None and klist is None and path.isdir(output): |
86 | raise Exception("The \"output\" is an existing directory, but a file path is expected.") | 88 | raise Exception("The \"output\" is an existing directory, but a file path is expected.") |
87 | 89 | ||
88 | if (kmax is not None or klist is not None) and path.isfile(output): | 90 | if (kmax is not None or klist is not None) and path.isfile(output): |
89 | raise Exception("The \"output\" is an existing file, but a directory path is expected.") | 91 | raise Exception("The \"output\" is an existing file, but a directory path is expected.") |
90 | 92 | ||
91 | # Mono value case | 93 | # Mono value case |
92 | if kmax is None and klist is None: | 94 | if kmax is None and klist is None: |
93 | print(f"Computing clustering with k={k}") | 95 | print(f"Computing clustering with k={k}") |
94 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 96 | model = CLUSTERING_METHODS["k-means"] |
95 | preds = kmeans.predict(X) | 97 | if mahalanobis: |
96 | pickle.dump(kmeans, open(output, "wb")) | 98 | print("Computing with mahalanobis distance") |
99 | model = CLUSTERING_METHODS["k-means-mahalanobis"] | ||
100 | model.fit(X, k) | ||
101 | model.save(output) | ||
97 | 102 | ||
98 | # Multi values case with kmax | 103 | # Multi values case with kmax |
99 | if kmax is not None: | 104 | if kmax is not None: |
100 | if not path.isdir(output): | 105 | if not path.isdir(output): |
101 | mkdir(output) | 106 | mkdir(output) |
102 | Ks = range(k, kmax + 1) | 107 | Ks = range(k, kmax + 1) |
103 | for i in Ks: | 108 | for i in Ks: |
104 | print(f"Computing clustering with k={i}") | 109 | model = CLUSTERING_METHODS["k-means"] |
105 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | 110 | if mahalanobis: |
106 | preds = kmeans.predict(X) | 111 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
107 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | 112 | model.fit(X, i) |
113 | model.save(path.join(output, "clustering_" + str(i) + ".pkl")) | ||
108 | 114 | ||
109 | # Second multi values case with klist | 115 | # Second multi values case with klist |
110 | if klist is not None: | 116 | if klist is not None: |
111 | if not path.isdir(output): | 117 | if not path.isdir(output): |
112 | mkdir(output) | 118 | mkdir(output) |
113 | for k in klist: | 119 | for k in klist: |
114 | k = int(k) | 120 | k = int(k) |
115 | print(f"Computing clustering with k={k}") | 121 | model = CLUSTERING_METHODS["k-means"] |
116 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 122 | if mahalanobis: |
117 | preds = kmeans.predict(X) | 123 | print("Computing with mahalanobis distance") |
118 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | 124 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
125 | model.fit(X, k) | ||
126 | model.save(path.join(output, "clustering_" + str(k) + ".pkl")) | ||
119 | 127 | ||
120 | 128 | ||
121 | if __name__ == "__main__": | 129 | if __name__ == "__main__": |
122 | # Main parser | 130 | # Main parser |
123 | parser = argparse.ArgumentParser(description="Clustering methods to apply") | 131 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
124 | subparsers = parser.add_subparsers(title="action") | 132 | subparsers = parser.add_subparsers(title="action") |
125 | 133 | ||
126 | # kmeans | 134 | # kmeans |
127 | parser_kmeans = subparsers.add_parser( | 135 | parser_kmeans = subparsers.add_parser( |
128 | "kmeans", help="Compute clustering using k-means algorithm") | 136 | "kmeans", help="Compute clustering using k-means algorithm") |
129 | 137 | ||
130 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") | 138 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") |
131 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") | 139 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") |
132 | parser_kmeans.add_argument("-k", default=2, type=int, | 140 | parser_kmeans.add_argument("-k", default=2, type=int, |
133 | help="number of clusters to compute. It is kmin if kmax is specified.") | 141 | help="number of clusters to compute. It is kmin if kmax is specified.") |
134 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") | 142 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
135 | parser_kmeans.add_argument("--klist", nargs="+", | 143 | parser_kmeans.add_argument("--klist", nargs="+", |
136 | help="List of k values to test. As kmax, activate the multi values mod.") | 144 | help="List of k values to test. As kmax, activate the multi values mod.") |
137 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | 145 | parser_kmeans.add_argument("--output", |
146 | default=".kmeans", | ||
147 | help="output file if only k. Output directory if multiple kmax specified.") | ||
148 | parser_kmeans.add_argument("--mahalanobis", action="store_true") | ||
138 | parser_kmeans.set_defaults(which="kmeans") | 149 | parser_kmeans.set_defaults(which="kmeans") |
139 | 150 | ||
140 | # measure | 151 | # measure |
141 | parser_measure = subparsers.add_parser( | 152 | parser_measure = subparsers.add_parser( |
142 | "measure", help="compute the entropy") | 153 | "measure", help="compute the entropy") |
143 | 154 | ||
144 | parser_measure.add_argument("--measure", | 155 | parser_measure.add_argument("--measure", |
145 | required=True, | 156 | required=True, |
146 | nargs="+", | 157 | nargs="+", |
147 | choices=[key for key in EVALUATION_METHODS], | 158 | choices=[key for key in EVALUATION_METHODS], |
148 | help="...") | 159 | help="...") |
149 | parser_measure.add_argument("--features", required=True, type=str, help="...") | 160 | parser_measure.add_argument("--features", required=True, type=str, help="...") |
150 | parser_measure.add_argument("--lst", required=True, type=str, help="...") | 161 | parser_measure.add_argument("--lst", required=True, type=str, help="...") |
151 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | 162 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") |
152 | parser_measure.add_argument("--model", required=True, type=str, help="...") | 163 | parser_measure.add_argument("--model", required=True, type=str, help="...") |
153 | parser_measure.add_argument("--modeltype", | 164 | parser_measure.add_argument("--modeltype", |
154 | required=True, | 165 | required=True, |
155 | choices=[key for key in CLUSTERING_METHODS], | 166 | choices=[key for key in CLUSTERING_METHODS], |
156 | help="type of model for learning") | 167 | help="type of model for learning") |
157 | parser_measure.set_defaults(which="measure") | 168 | parser_measure.set_defaults(which="measure") |
158 | 169 | ||
159 | # disequilibrium | 170 | # disequilibrium |
160 | parser_disequilibrium = subparsers.add_parser( | 171 | parser_disequilibrium = subparsers.add_parser( |
161 | "disequilibrium", help="...") | 172 | "disequilibrium", help="...") |
162 | 173 | ||
163 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | 174 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") |
164 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | 175 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") |
165 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | 176 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") |
166 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | 177 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") |
167 | parser_disequilibrium.add_argument("--model-type", | 178 | parser_disequilibrium.add_argument("--model-type", |
168 | required=True, | 179 | required=True, |
169 | choices=["kmeans", "2", "3"], | 180 | choices=["kmeans", "2", "3"], |
170 | help="...") | 181 | help="...") |
171 | parser_disequilibrium.set_defaults(which="disequilibrium") | 182 | parser_disequilibrium.set_defaults(which="disequilibrium") |
172 | 183 | ||
173 | # Parse | 184 | # Parse |
174 | args = parser.parse_args() | 185 | args = parser.parse_args() |
175 | 186 | ||
176 | # Run commands | 187 | # Run commands |
177 | runner = SubCommandRunner({ | 188 | runner = SubCommandRunner({ |
178 | "kmeans": kmeans_run, | 189 | "kmeans": kmeans_run, |
179 | "measure": measure_run, | 190 | "measure": measure_run, |
180 | "disequilibrium": disequilibrium_run | 191 | "disequilibrium": disequilibrium_run |
181 | }) | 192 | }) |
182 | 193 | ||
183 | runner.run(args.which, args.__dict__, remove="which") | 194 | runner.run(args.which, args.__dict__, remove="which") |
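With this change, kmeans_run no longer calls sklearn's KMeans directly: both distance variants go through the wrapper objects in CLUSTERING_METHODS, which share a fit/save/load/predict surface. A minimal sketch of that dispatch, assuming the clustering_modules package from this commit is on the import path (fit_and_save is a hypothetical helper, not a function from the repo):

```python
# Sketch of the selection kmeans_run performs for a single k.
import numpy as np
from clustering_modules.kmeans import kmeans
from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
}

def fit_and_save(X: np.ndarray, k: int, output: str,
                 mahalanobis: bool = False) -> None:
    # The --mahalanobis flag picks the wrapper; both expose fit() and save().
    key = "k-means-mahalanobis" if mahalanobis else "k-means"
    model = CLUSTERING_METHODS[key]
    model.fit(X, k)
    model.save(output)
```

Because measure_run looks the wrapper up by --modeltype and only calls load() and predict(), a model trained with either distance is evaluated through the same code path.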
volia/clustering_modules/kmeans.py
1 | 1 | ||
2 | from sklearn.cluster import KMeans | 2 | from sklearn.cluster import KMeans |
3 | import pickle | 3 | import pickle |
4 | from abstract_clustering import AbstractClustering | 4 | from abstract_clustering import AbstractClustering |
5 | 5 | ||
6 | class kmeans(): | 6 | class kmeans(): |
7 | def __init__(self): | 7 | def __init__(self): |
8 | self.kmeans_model = None | 8 | self.kmeans_model = None |
9 | 9 | ||
10 | def predict(self, features): | 10 | def predict(self, features): |
11 | """ | ||
12 | |||
13 | @param features: | ||
14 | @return: | ||
15 | """ | ||
11 | return self.kmeans_model.predict(features) | 16 | return self.kmeans_model.predict(features) |
12 | 17 | ||
13 | def load(self, model_path): | 18 | def load(self, model_path: str): |
14 | self.kmeans_model = pickle.load(open(model_path, "rb")) | 19 | """ |
20 | |||
21 | @param model_path: | ||
22 | @return: | ||
23 | """ | ||
24 | with open(model_path, "rb") as f: | ||
25 | self.kmeans_model = pickle.load(f) | ||
26 | |||
27 | def save(self, model_path: str): | ||
28 | """ | ||
29 | |||
30 | @param model_path: | ||
31 | @return: | ||
32 | """ | ||
33 | with open(model_path, "wb") as f: | ||
34 | pickle.dump(self.kmeans_model, f) | ||
35 | |||
36 | def fit(self, features, k: int): | ||
37 | """ | ||
38 | |||
39 | @param features: | ||
40 | @param k: | ||
41 | @return: | ||
42 | """ | ||
43 | self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) | ||
15 | 44 |
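The kmeans wrapper now mirrors the fit/save/load/predict surface of kmeansMahalanobis, making the two interchangeable in CLUSTERING_METHODS. A round-trip sketch; the file name and the random data are illustrative:

```python
import numpy as np
from clustering_modules.kmeans import kmeans

X = np.random.rand(100, 8)   # 100 samples, 8 features (made-up data)

km = kmeans()
km.fit(X, 4)                 # wraps KMeans(n_clusters=4, n_init=10, random_state=0)
km.save("kmeans_4.pkl")      # pickles the fitted sklearn estimator

km2 = kmeans()
km2.load("kmeans_4.pkl")     # restores it into a fresh wrapper
labels = km2.predict(X)      # one cluster index per sample
assert labels.shape == (100,)
```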
volia/clustering_modules/kmeans_mahalanobis.py
File was created | 1 | ||
2 | |||
3 | from sklearn.cluster import KMeans | ||
4 | import pickle | ||
5 | import numpy as np | ||
6 | import matplotlib.pyplot as plt | ||
7 | from sklearn.manifold import TSNE | ||
8 | from abstract_clustering import AbstractClustering | ||
9 | |||
10 | class kmeansMahalanobis(): | ||
11 | def __init__(self): | ||
12 | """ | ||
13 | |||
14 | """ | ||
15 | self.C = None | ||
16 | self.L = None | ||
17 | self.K = None | ||
18 | |||
19 | def predict(self, features): | ||
20 | """ | ||
21 | |||
22 | @param features: | ||
23 | @return: | ||
24 | """ | ||
25 | N = features.shape[0] | ||
26 | distances = np.zeros((N, self.K)) | ||
27 | for n in range(N): | ||
28 | for k in range(self.K): | ||
29 | distances[n][k] = self._dist(features[n], self.C[k], self.L[k]) | ||
30 | print(distances) | ||
31 | closest_cluster = np.argmin(distances, axis=1) | ||
32 | return closest_cluster | ||
33 | |||
34 | def load(self, model_path): | ||
35 | """ | ||
36 | |||
37 | @param model_path: | ||
38 | @return: | ||
39 | """ | ||
40 | data = None | ||
41 | with open(model_path, "rb") as f: | ||
42 | data = pickle.load(f) | ||
43 | if data is None: | ||
44 | raise Exception("Le modèle n'a pas pu être chargé") | ||
45 | else: | ||
46 | self.C = data["C"] | ||
47 | self.L = data["L"] | ||
48 | self.K = data["K"] | ||
49 | |||
50 | def save(self, modelpath: str): | ||
51 | """ | ||
52 | |||
53 | @param modelpath: | ||
54 | @return: | ||
55 | """ | ||
56 | data = { | ||
57 | "C": self.C, | ||
58 | "L": self.L, | ||
59 | "K": self.K | ||
60 | } | ||
61 | with open(modelpath, "wb") as f: | ||
62 | pickle.dump(data, f) | ||
63 | |||
64 | def fit(self, features, K: int): | ||
65 | self._train(features, K) | ||
66 | |||
67 | def _initialize_model(self, X, number_clusters): | ||
68 | d = X.shape[1] | ||
69 | C = X[np.random.choice(X.shape[0], number_clusters)] | ||
70 | L = np.zeros((number_clusters, d, d)) | ||
71 | for k in range(number_clusters): | ||
72 | L[k] = np.identity(d) | ||
73 | return C, L | ||
74 | |||
75 | def _dist(self, a, b, l): | ||
76 | ''' | ||
77 | Squared Mahalanobis distance between a and b, using precision matrix l. | ||
78 | ''' | ||
79 | a = np.reshape(a, (-1, 1)) | ||
80 | b = np.reshape(b, (-1, 1)) | ||
81 | result = np.transpose(a - b).dot(l).dot(a-b)[0][0] | ||
82 | return result | ||
83 | |||
84 | def _plot_iteration(self, iteration, points, clusters, centers): | ||
85 | fig = plt.figure() | ||
86 | ax = fig.add_subplot(111) | ||
87 | scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) | ||
88 | |||
89 | #for center in centers: | ||
90 | # ax.scatter(center[0], center[1], s=50, c='red', marker='+') | ||
91 | ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+') | ||
92 | |||
93 | ax.set_xlabel('x') | ||
94 | ax.set_ylabel('y') | ||
95 | plt.colorbar(scatter) | ||
96 | #plt.ylim(0, 1) | ||
97 | #plt.xlim(0, 1) | ||
98 | plt.savefig("test_" + str(iteration) + ".pdf") | ||
99 | |||
100 | def _train(self, features, K: int): | ||
101 | X = features | ||
102 | N = X.shape[0] | ||
103 | d = X.shape[1] | ||
104 | |||
105 | C, L = self._initialize_model(X, K) | ||
106 | self.C = C | ||
107 | self.L = L | ||
108 | self.K = K | ||
109 | |||
110 | end_algo = False | ||
111 | i = 0 | ||
112 | while not end_algo: | ||
113 | if i == 10:  # temporary cap on the number of iterations | ||
114 | break | ||
115 | print("Iteration: ", i) | ||
116 | # Compute the distance matrix | ||
117 | distances = np.zeros((N, K)) | ||
118 | |||
119 | for n in range(N): | ||
120 | for k in range(self.K): | ||
121 | distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) | ||
122 | print(distances) | ||
123 | closest_cluster = np.argmin(distances, axis=1) | ||
124 | if i % 1 == 0: | ||
125 | # -- Debug tool ---------------------- | ||
126 | # TSNE | ||
127 | #X_embedded = np.concatenate((X, self.C), axis=0) | ||
128 | X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | ||
129 | # Then plot | ||
130 | self._plot_iteration( | ||
131 | i, | ||
132 | X_embedded[:X.shape[0]], | ||
133 | closest_cluster, | ||
134 | X_embedded[X.shape[0]:] | ||
135 | ) | ||
136 | # ------------------------------------ | ||
137 | |||
138 | end_algo = True | ||
139 | for k in range(K): | ||
140 | # Find the subset of X closest to the centroid c_k. | ||
141 | X_sub = np.where(closest_cluster == k) | ||
142 | X_sub = np.take(X, X_sub[0], axis=0) | ||
144 | C_new = np.mean(X_sub, axis=0) | ||
145 | |||
146 | # -- COMPUTE NEW LAMBDA (here named K) -- | ||
147 | K_new = np.zeros((L.shape[1], L.shape[2])) | ||
148 | for x in X_sub: | ||
149 | x = np.reshape(x, (-1, 1)) | ||
150 | c_tmp = np.reshape(C_new, (-1, 1)) | ||
151 | K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) | ||
152 | K_new = K_new / X_sub.shape[0] | ||
153 | K_new = np.linalg.inv(K_new) | ||
154 | |||
155 | if end_algo and (not (self.C[k] == C_new).all()):  # stop only once no centroid moves | ||
156 | end_algo = False | ||
157 | self.C[k] = C_new | ||
158 | self.L[k] = K_new | ||
159 | i = i + 1 | ||
160 |
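The infinite or nan values flagged in the commit message most plausibly come from the K_new update in _train: an empty cluster makes both np.mean and the division by X_sub.shape[0] produce nan, and a cluster with fewer than d points leaves the scatter matrix singular, so np.linalg.inv either raises or propagates inf/nan. One possible remedy, offered as a sketch rather than as code from this commit (safe_precision and eps are illustrative names/values), is to fall back to the identity for empty clusters, echoing _initialize_model, and to shrink the scatter matrix toward the identity before inverting:

```python
import numpy as np

def safe_precision(X_sub: np.ndarray, center: np.ndarray,
                   eps: float = 1e-6) -> np.ndarray:
    """Regularized inverse of the within-cluster scatter matrix (sketch)."""
    d = center.shape[0]
    if X_sub.shape[0] == 0:
        # Empty cluster: keep the identity precision, as in _initialize_model.
        return np.identity(d)
    diff = X_sub - center                      # (n_k, d), broadcasts over rows
    scatter = diff.T @ diff / X_sub.shape[0]   # same quantity as K_new in _train
    scatter += eps * np.identity(d)            # shrinkage keeps it invertible
    return np.linalg.inv(scatter)
```

np.linalg.pinv is an alternative when shrinkage is not wanted.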