Commit 6bc3b63707bac9240e9df369b071a6f764aa5d2f

Authored by Mathias
1 parent 3b7e63994c
Exists in master

Now saves the KMeans parameters to a JSON file.
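
For context, a minimal sketch of how the saved JSON could be read back to rebuild an identically configured estimator. The reload step is not part of the commit; the path shown assumes the per-model subdirectory created by the script below (model 1) and an empty --prefix.

import json
from sklearn.cluster import KMeans

# Hypothetical reload of the parameters written by train_clustering();
# "outdir/1/kmeans_parameters.json" is an assumed path based on the
# script's outdir/<model index>/<prefix>kmeans_parameters.json layout.
with open("outdir/1/kmeans_parameters.json") as f:
    kmeans_parameters = json.load(f)

# Rebuild an estimator with the same configuration as the trained one.
estimator = KMeans(**kmeans_parameters)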

Showing 1 changed file with 13 additions and 4 deletions

scripts/evaluations/clustering.py
 '''
 This script allows the user to evaluate a classification system on new labels using clustering methods.
 The algorithms are applied on the given latent space (embedding).
 '''
 import argparse
 import numpy as np
 import pandas as pd
 import os
 import time
 import pickle
 import csv
+import json
 
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
 import matplotlib.pyplot as plt
 
 from volia.data_io import read_features, read_lst
 from volia.measures import entropy_score, purity_score
 
 '''
 TODO:
 - Add an option allowing the user to choose the number of
 clustering to train in order to compute the average and the
 '''
 
 
 def train_clustering(label_encoder, feats, classes, outdir):
     num_classes = len(label_encoder.classes_)
 
     # Compute KMEANS clustering on data
+    kmeans_parameters = {
+        "n_clusters": num_classes,
+        "n_init": 100,
+        "tol": 1e-6,
+        "algorithm": "elkan"
+    }
+
+    # Save parameters
+    with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
+        json.dump(kmeans_parameters, f)
+
     estimator = KMeans(
-        n_clusters=num_classes,
-        n_init=100,
-        tol=10-6,
-        algorithm="elkan"
+        **kmeans_parameters
     )
     estimator.fit(feats)
     print(f"KMeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}")
 
     with open(os.path.join(outdir, f"{args.prefix}kmeans.pkl"), "wb") as f:
         pickle.dump(estimator, f)
 
     # contains distance to each cluster for each sample
     dist_space = estimator.transform(feats)
     predictions = np.argmin(dist_space, axis=1)
 
     # gives each cluster a name (considering most represented character)
     dataframe = pd.DataFrame({
         "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
         "prediction": pd.Series(predictions)
     })
 
     def find_cluster_name_fn(c):
         mask = dataframe["prediction"] == c
         return dataframe[mask]["label"].value_counts(sort=False).idxmax()
 
     cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
     predicted_labels = le.transform(
         [cluster_names[pred] for pred in predictions])
 
     # F-measure
     fscores = f1_score(labels, predicted_labels, average=None)
     fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
 
     # Entropy
     _, _, entropy = entropy_score(labels, predicted_labels)
 
     # Homogeneity
     homogeneity = homogeneity_score(labels, predicted_labels)
 
     # Completeness
     completeness = completeness_score(labels, predicted_labels)
 
     # V-Measure
     v_measure = v_measure_score(labels, predicted_labels)
 
     # Purity
     purity_scores = purity_score(labels, predicted_labels)
     purity_class_score = purity_scores["purity_class_score"]
     purity_cluster_score = purity_scores["purity_cluster_score"]
     K = purity_scores["K"]
 
     # Write results
     with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
         print(f"F1-scores for each class:\n{fscores_str}", file=fd)
         print(f"Entropy: {entropy}", file=fd)
         print(f"Global score: {np.mean(fscores)}", file=fd)
         print(f"Homogeneity: {homogeneity}", file=fd)
         print(f"Completeness: {completeness}", file=fd)
         print(f"V-measure: {v_measure}", file=fd)
         print(f"Purity class score: {purity_class_score}", file=fd)
         print(f"Purity cluster score: {purity_cluster_score}", file=fd)
         print(f"Purity overall evaluation criterion (K): {K}", file=fd)
 
     # Process t-SNE and plot
     tsne_estimator = TSNE()
     embeddings = tsne_estimator.fit_transform(feats)
     print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
         tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
 
     fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
     for c, name in enumerate(le.classes_):
         c_mask = np.where(labels == c)
         axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
 
         try:
             id_cluster = cluster_names.index(name)
         except ValueError:
             print("WARNING: no cluster found for {}".format(name))
             continue
         c_mask = np.where(predictions == id_cluster)
         axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
 
     axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe1.set_title("true labels")
     axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe2.set_title("predicted cluster label")
 
     plt.suptitle("Kmeans Clustering")
 
     loc = os.path.join(
         outdir,
         args.prefix + "kmeans.pdf"
     )
     plt.savefig(loc, bbox_inches="tight")
     plt.close()
 
     print("INFO: figure saved at {}".format(loc))
 
     end = time.time()
     print("program ended in {0:.2f} seconds".format(end - start))
     return {
         "f1": np.mean(fscores),
         "entropy": entropy,
         "homogeneity": homogeneity,
         "completeness": completeness,
         "v-measure": v_measure,
         "purity_class_score": purity_class_score,
         "purity_cluster_score": purity_cluster_score,
         "K": K
     }
 
 
 if __name__ == "__main__":
     # Argparse
     parser = argparse.ArgumentParser("Compute clustering on a latent space")
     parser.add_argument("features")
     parser.add_argument("utt2",
                         type=str,
                         help="file with [utt] [value]")
     parser.add_argument("--idsfrom",
                         type=str,
                         default="utt2",
                         choices=[
                             "features",
                             "utt2"
                         ],
                         help="from features or from utt2?")
     parser.add_argument("--prefix",
                         default="",
                         type=str,
                         help="prefix of saved files")
     parser.add_argument("--outdir",
                         default=None,
                         type=str,
                         help="Output directory")
     parser.add_argument("--nmodels",
                         type=int,
                         default=1,
                         help="specifies the number of models to train")
     args = parser.parse_args()
 
     assert args.outdir
 
     start = time.time()
 
     # Load features and utt2
     features = read_features(args.features)
     utt2 = read_lst(args.utt2)
 
     # Take id list
     if args.idsfrom == "features":
         ids = list(features.keys())
     elif args.idsfrom == "utt2":
         ids = list(utt2.keys())
     else:
         print(f"Unexpected value for --idsfrom: {args.idsfrom}")
         exit(1)
 
     feats = np.vstack([features[id_] for id_ in ids])
     classes = [utt2[id_] for id_ in ids]
 
     # Encode labels
     le = LabelEncoder()
     labels = le.fit_transform(classes)
 
     measures = {}
     for i in range(1, args.nmodels + 1):
         subdir = os.path.join(args.outdir, str(i))
         if not os.path.exists(subdir):
             os.mkdir(subdir)
         print(f"[{i}/{args.nmodels}] => {subdir}")
         results = train_clustering(le, feats, classes, subdir)
 
         for key, value in results.items():
             if key not in measures:
                 measures[key] = []
             measures[key].append(value)
 
 
     # File with results
     file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")
 
     with open(file_results, "w") as f:
         f.write(f"[nmodels: {args.nmodels}]\n")
         for key in measures.keys():
             values = np.asarray(measures[key], dtype=float)
             mean = np.mean(values)
             std = np.std(values)
             f.write(f"[{key} => mean: {mean}, std: {std}]\n")
 
     # CSV File with all the values
     file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")
 
     with open(file_csv_measures, "w", newline="") as f:
         writer = csv.writer(f, delimiter=",")
         writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean", "std"])
         for key in measures.keys():
             values = np.asarray(measures[key], dtype=float)