Commit 4152e83df25ef19c8b048592e9629911bcf77e1a

Authored by quillotm
1 parent 3c07f672ad
Exists in master

Adding kmeans mahalanobis. The algorithm is now fully functional. Need to solve some problems with identity matrix usage and infinite or NaN values.

Showing 3 changed files with 216 additions and 17 deletions

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
+from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

 import core.measures
 import json


 CLUSTERING_METHODS = {
-    "k-means": kmeans()
+    "k-means": kmeans(),
+    "k-means-mahalanobis": kmeansMahalanobis()
 }

 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }

 def disequilibrium_run():
     pass


 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """

     @param measure:
     @param features:
     @param lst:
     @param truelabels:
     @param model:
     @param modeltype:
     @return:
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     eval = {}
     for ms in measure:
         evaluation = EVALUATION_METHODS[ms]
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_dict = read_lst(lst)
         lst_keys = [key for key in lst_dict]
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         Y_truth = [labels_dict[key][0] for key in lst_keys]

         le = LabelEncoder()
         le.fit(Y_truth)
         Y_truth = le.transform(Y_truth)

         eval[ms] = evaluation(Y_truth, Y_pred)

     print(json.dumps(eval))
67 69
68 70 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False):
69 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
70 """ 71 """
71 72
72 @param features: output features 73 @param features: output features
73 @param lst: list file 74 @param lst: list file
74 @param k: k (kmin if kmax specified) 75 @param k: k (kmin if kmax specified)
75 @param kmax: maximum k to compute 76 @param kmax: maximum k to compute
76 @param klist: list of k values to compute, ignore k value 77 @param klist: list of k values to compute, ignore k value
77 @param output: output file if kmax not specified, else, output directory 78 @param output: output file if kmax not specified, else, output directory
79 @param mahalanobis: distance option of k-means.
78 """ 80 """
79 # -- READ FILES -- 81 # -- READ FILES --
80 features_dict = read_features(features) 82 features_dict = read_features(features)
81 lst_dict = read_lst(lst) 83 lst_dict = read_lst(lst)
82 X = np.asarray([features_dict[x] for x in lst_dict]) 84 X = np.asarray([features_dict[x] for x in lst_dict])
83 85
84 # Exception cases 86 # Exception cases
85 if kmax is None and klist is None and path.isdir(output): 87 if kmax is None and klist is None and path.isdir(output):
86 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") 88 raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
87 89
88 if (kmax is not None or klist is not None) and path.isfile(output): 90 if (kmax is not None or klist is not None) and path.isfile(output):
89 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") 91 raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
90 92
91 # Mono value case 93 # Mono value case
92 if kmax is None and klist is None: 94 if kmax is None and klist is None:
93 print(f"Computing clustering with k={k}") 95 print(f"Computing clustering with k={k}")
94 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) 96 model = CLUSTERING_METHODS["k-means"]
95 preds = kmeans.predict(X) 97 if mahalanobis:
96 pickle.dump(kmeans, open(output, "wb")) 98 print("Computing with mahalanobis distance")
99 model = CLUSTERING_METHODS["k-means-mahalanobis"]
100 model.fit(X, k)
101 model.save(output)
97 102
98 # Multi values case with kmax 103 # Multi values case with kmax
99 if kmax is not None: 104 if kmax is not None:
100 if not path.isdir(output): 105 if not path.isdir(output):
101 mkdir(output) 106 mkdir(output)
102 Ks = range(k, kmax + 1) 107 Ks = range(k, kmax + 1)
103 for i in Ks: 108 for i in Ks:
104 print(f"Computing clustering with k={i}") 109 model = CLUSTERING_METHODS["k-means"]
105 kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) 110 if mahalanobis:
106 preds = kmeans.predict(X) 111 model = CLUSTERING_METHODS["k-means-mahalanobis"]
107 pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) 112 model.fit(X, i)
113 model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
108 114
109 # Second multi values case with klist 115 # Second multi values case with klist
110 if klist is not None: 116 if klist is not None:
111 if not path.isdir(output): 117 if not path.isdir(output):
112 mkdir(output) 118 mkdir(output)
113 for k in klist: 119 for k in klist:
114 k = int(k) 120 k = int(k)
115 print(f"Computing clustering with k={k}") 121 model = CLUSTERING_METHODS["k-means"]
116 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) 122 if mahalanobis:
117 preds = kmeans.predict(X) 123 print("Computing with mahalanobis distance")
118 pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) 124 model = CLUSTERING_METHODS["k-means-mahalanobis"]
125 model.fit(X, k)
126 model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
119 127
120 128
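Both entries in CLUSTERING_METHODS expose the same fit/save/predict interface, which is what lets kmeans_run swap back-ends on a single flag. A minimal sketch of that contract, assuming both modules are importable and using random data as a stand-in for real features:

import numpy as np
from clustering_modules.kmeans import kmeans
from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

X = np.random.rand(100, 5)
for name, model in [("k-means", kmeans()),
                    ("k-means-mahalanobis", kmeansMahalanobis())]:
    model.fit(X, 3)                 # same signature for both back-ends
    model.save(f"/tmp/{name}.pkl")  # placeholder path
    print(name, model.predict(X)[:5])
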
 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")

     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using the k-means algorithm")

     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. Like kmax, activates multi-value mode.")
-    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k is given; output directory if kmax or klist is specified.")
+    parser_kmeans.add_argument("--output",
+                               default=".kmeans",
+                               help="output file if only k is given; output directory if kmax or klist is specified.")
+    parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")

     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute evaluation measures")

     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")

     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")

     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")

     # Parse
     args = parser.parse_args()

     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })

     runner.run(args.which, args.__dict__, remove="which")
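
For reference, a self-contained check (hypothetical, mirroring the parser definition above) that a store_true flag reaches kmeans_run through args.__dict__ as a boolean:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--mahalanobis", action="store_true")
print(vars(p.parse_args(["--mahalanobis"])))  # {'mahalanobis': True}
print(vars(p.parse_args([])))                 # {'mahalanobis': False}
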
volia/clustering_modules/kmeans.py

 from sklearn.cluster import KMeans
 import pickle
 from abstract_clustering import AbstractClustering

 class kmeans():
     def __init__(self):
         self.kmeans_model = None

     def predict(self, features):
+        """
+
+        @param features:
+        @return:
+        """
         return self.kmeans_model.predict(features)

-    def load(self, model_path):
-        self.kmeans_model = pickle.load(open(model_path, "rb"))
+    def load(self, model_path: str):
+        """
+
+        @param model_path:
+        @return:
+        """
+        with open(model_path, "rb") as f:
+            self.kmeans_model = pickle.load(f)
+
+    def save(self, model_path: str):
+        """
+
+        @param model_path:
+        @return:
+        """
+        with open(model_path, "wb") as f:
+            pickle.dump(self.kmeans_model, f)
+
+    def fit(self, features, k: int):
+        """
+
+        @param features:
+        @param k:
+        @return:
+        """
+        self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)

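A quick round-trip sketch for the wrapper above, with random data and a placeholder path (neither from the commit): fit, save, reload, and check that predictions agree.

import numpy as np
from clustering_modules.kmeans import kmeans

X = np.random.rand(50, 4)
m = kmeans()
m.fit(X, 2)
m.save("/tmp/km.pkl")

m2 = kmeans()
m2.load("/tmp/km.pkl")
assert (m.predict(X) == m2.predict(X)).all()
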
volia/clustering_modules/kmeans_mahalanobis.py
File was created
+
+from sklearn.cluster import KMeans
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+from abstract_clustering import AbstractClustering
+
+class kmeansMahalanobis():
+    def __init__(self):
+        """
+        Centroids C, per-cluster precision matrices L, and number of clusters K.
+        """
+        self.C = None
+        self.L = None
+        self.K = None
+
+    def predict(self, features):
+        """
+
+        @param features:
+        @return:
+        """
+        N = features.shape[0]
+        distances = np.zeros((N, self.K))
+        for n in range(N):
+            for k in range(self.K):
+                distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
+        closest_cluster = np.argmin(distances, axis=1)
+        return closest_cluster
+
+    def load(self, model_path):
+        """
+
+        @param model_path:
+        @return:
+        """
+        with open(model_path, "rb") as f:
+            data = pickle.load(f)
+        if data is None:
+            raise Exception("The model could not be loaded")
+        self.C = data["C"]
+        self.L = data["L"]
+        self.K = data["K"]
+
+    def save(self, modelpath: str):
+        """
+
+        @param modelpath:
+        @return:
+        """
+        data = {
+            "C": self.C,
+            "L": self.L,
+            "K": self.K
+        }
+        with open(modelpath, "wb") as f:
+            pickle.dump(data, f)
+
+    def fit(self, features, K: int):
+        self._train(features, K)
+
+    def _initialize_model(self, X, number_clusters):
+        d = X.shape[1]
+        # Note: np.random.choice samples with replacement, so two initial
+        # centroids can coincide.
+        C = X[np.random.choice(X.shape[0], number_clusters)]
+        L = np.zeros((number_clusters, d, d))
+        for k in range(number_clusters):
+            L[k] = np.identity(d)
+        return C, L
+
+    def _dist(self, a, b, l):
+        '''
+        Squared Mahalanobis distance: (a - b)^T l (a - b).
+        '''
+        a = np.reshape(a, (-1, 1))
+        b = np.reshape(b, (-1, 1))
+        result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
+        return result
+
+    def _plot_iteration(self, iteration, points, clusters, centers):
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
+        ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
+        ax.set_xlabel('x')
+        ax.set_ylabel('y')
+        plt.colorbar(scatter)
+        plt.savefig("test_" + str(iteration) + ".pdf")
+
+    def _train(self, features, K: int):
+        X = features
+        N = X.shape[0]
+
+        C, L = self._initialize_model(X, K)
+        self.C = C
+        self.L = L
+        self.K = K
+
+        end_algo = False
+        i = 0
+        while not end_algo:
+            if i == 10:
+                # Safety cap on the number of iterations.
+                break
+            print("Iteration: ", i)
+
+            # Compute the distance matrix between samples and centroids.
+            distances = np.zeros((N, K))
+            for n in range(N):
+                for k in range(self.K):
+                    distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
+            closest_cluster = np.argmin(distances, axis=1)
+
+            if i % 1 == 0:
+                # -- Debug tool ----------------------
+                # i % 1 == 0 is always true, so the plot is drawn every
+                # iteration: project points and centroids with TSNE, then plot.
+                X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                self._plot_iteration(
+                    i,
+                    X_embedded[:X.shape[0]],
+                    closest_cluster,
+                    X_embedded[X.shape[0]:]
+                )
+                # ------------------------------------
+
+            end_algo = True
+            for k in range(K):
+                # Find the subset of X assigned to centroid c_k.
+                X_sub = np.where(closest_cluster == k)
+                X_sub = np.take(X, X_sub[0], axis=0)
+                C_new = np.mean(X_sub, axis=0)
+
+                # -- COMPUTE NEW LAMBDA (here named K_new) --
+                # An empty cluster or a singular scatter matrix here yields the
+                # inf/NaN values mentioned in the commit message.
+                K_new = np.zeros((L.shape[1], L.shape[2]))
+                for x in X_sub:
+                    x = np.reshape(x, (-1, 1))
+                    c_tmp = np.reshape(C_new, (-1, 1))
+                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
+                K_new = K_new / X_sub.shape[0]
+                K_new = np.linalg.inv(K_new)
+
+                if end_algo and (not (self.C[k] == C_new).all()):
+                    # A centroid moved, so keep iterating.
+                    end_algo = False
+                self.C[k] = C_new
+                self.L[k] = K_new
+            i = i + 1
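
The commit message flags open problems with identity-matrix usage and infinite or NaN values. In _train they can arise when a cluster ends up empty (np.mean over zero rows returns NaN) or when the scatter matrix is rank-deficient (np.linalg.inv fails or diverges). One possible guard, sketched under those assumptions rather than taken from the author's code:

import numpy as np

def safe_inverse_covariance(X_sub: np.ndarray, d: int, eps: float = 1e-6) -> np.ndarray:
    if X_sub.shape[0] == 0:
        # Empty cluster: fall back to the identity, as in _initialize_model.
        return np.identity(d)
    centered = X_sub - X_sub.mean(axis=0)
    cov = centered.T.dot(centered) / X_sub.shape[0]
    # Tikhonov regularization keeps the matrix invertible.
    return np.linalg.inv(cov + eps * np.identity(d))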