Commit 3b7e63994c7b1b562f19ca10c9c7b3b472483644

Authored by Mathias
1 parent 15b183a24d
Exists in master

Readjust the way files are named using the prefix

Showing 1 changed file with 4 additions and 4 deletions
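
The four changed lines make every file the script writes use the user-supplied --prefix: previously the KMeans pickle and the two measures files ignored it, and the evaluation log carried an extra hard-coded leading underscore. A minimal sketch of the resulting paths, assuming a purely illustrative prefix value run1_ (not taken from the commit):

    os.path.join(outdir, f"{args.prefix}kmeans.pkl")                    # now "run1_kmeans.pkl", was "_kmeans.pkl"
    os.path.join(outdir, args.prefix + "eval_clustering.log")           # now "run1_eval_clustering.log", was "_run1_eval_clustering.log"
    os.path.join(args.outdir, args.prefix + "clustering_measures.txt")  # was the unprefixed "clustering_measures.txt"
    os.path.join(args.outdir, args.prefix + "clustering_measures.csv")  # was the unprefixed "clustering_measures.csv"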

scripts/evaluations/clustering.py
'''
This script allows the user to evaluate a classification system on new labels using clustering methods.
The algorithms are applied on the given latent space (embedding).
'''
import argparse
import numpy as np
import pandas as pd
import os
import time
import pickle
import csv

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
import matplotlib.pyplot as plt

from volia.data_io import read_features, read_lst
from volia.measures import entropy_score, purity_score

'''
TODO:
- Add an option allowing the user to choose the number of
clustering models to train in order to compute the average and the standard deviation of the measures.
'''


def train_clustering(label_encoder, feats, classes, outdir):
    num_classes = len(label_encoder.classes_)

    # Compute KMEANS clustering on data
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        tol=1e-6,
        algorithm="elkan"
    )
    estimator.fit(feats)
    print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}")

-    with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f:
+    with open(os.path.join(outdir, f"{args.prefix}kmeans.pkl"), "wb") as f:
        pickle.dump(estimator, f)

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = le.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))

    # Entropy
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogeneity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Purity
    purity_scores = purity_score(labels, predicted_labels)
    purity_class_score = purity_scores["purity_class_score"]
    purity_cluster_score = purity_scores["purity_cluster_score"]
    K = purity_scores["K"]

    # Write results
-    with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd:
+    with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each class:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score : {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"completeness: {completeness}", file=fd)
        print(f"v-measure: {v_measure}", file=fd)
        print(f"purity class score: {purity_class_score}", file=fd)
        print(f"purity cluster score: {purity_cluster_score}", file=fd)
        print(f"purity overall evaluation criterion (K): {K}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(le.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end-start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure,
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser("Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    parser.add_argument("--idsfrom",
                        type=str,
                        default="utt2",
                        choices=[
                            "features",
                            "utt2"
                        ],
                        help="from features or from utt2?")
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")
    parser.add_argument("--nmodels",
                        type=int,
                        default=1,
                        help="specifies the number of models to train")
    args = parser.parse_args()

    assert args.outdir

    start = time.time()

    # Load features and utt2
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    # Take id list
    if args.idsfrom == "features":
        ids = list(features.keys())
    elif args.idsfrom == "utt2":
        ids = list(utt2.keys())
    else:
        print(f"Unexpected value for --idsfrom: {args.idsfrom}")
        exit(1)

    feats = np.vstack([ features[id_] for id_ in ids ])
    classes = [ utt2[id_] for id_ in ids ]

    # Encode labels
    le = LabelEncoder()
    labels = le.fit_transform(classes)

    measures = {}
    for i in range(1, args.nmodels+1):
        subdir = os.path.join(args.outdir, str(i))
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        print(f"[{i}/{args.nmodels}] => {subdir}")
        results = train_clustering(le, feats, classes, subdir)

        for key, value in results.items():
            if key not in measures:
                measures[key] = []
            measures[key].append(results[key])


    # File with results
-    file_results = os.path.join(args.outdir, "clustering_measures.txt")
+    file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")

    with open(file_results, "w") as f:
        f.write(f"[nmodels: {args.nmodels}]\n")
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            f.write(f"[{key} => mean: {mean}, std: {std}] \n")

    # CSV File with all the values
-    file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv")
+    file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")

    with open(file_csv_measures, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            writer.writerow([key] + list(values) + [mean] + [std])
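
For context, a hypothetical invocation of the updated script; the input file names and option values below are illustrative assumptions, not part of the commit, and the features and utt2 files are expected in whatever formats volia.data_io.read_features and read_lst accept:

    python scripts/evaluations/clustering.py feats.txt utt2char --outdir exp/clustering --prefix run1_ --nmodels 10

Each of the 10 runs then writes run1_kmeans.pkl, run1_eval_clustering.log and run1_kmeans.pdf into exp/clustering/<i>/, while the aggregated mean and standard deviation of the clustering measures go to exp/clustering/run1_clustering_measures.txt and run1_clustering_measures.csv. Note that exp/clustering must exist beforehand, since the script only creates the numbered per-run subdirectories.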