Commit 15b183a24d4fa0c2e97c2239e8a132470a6749b1

Authored by Mathias
1 parent 503bfd9274
Exists in master

Add purity measure to the script

Showing 1 changed file with 15 additions and 2 deletions Inline Diff

scripts/evaluations/clustering.py
1 ''' 1 '''
2 This script allows the user to evaluate a classification system on new labels using clustering methods. 2 This script allows the user to evaluate a classification system on new labels using clustering methods.
3 The algorithms are applied on the given latent space (embedding). 3 The algorithms are applied on the given latent space (embedding).
4 ''' 4 '''
5 import argparse 5 import argparse
6 import numpy as np 6 import numpy as np
7 import pandas as pd 7 import pandas as pd
8 import os 8 import os
9 import time 9 import time
10 import pickle 10 import pickle
11 import csv 11 import csv
12 12
13 from sklearn.preprocessing import LabelEncoder 13 from sklearn.preprocessing import LabelEncoder
14 from sklearn.metrics.pairwise import pairwise_distances 14 from sklearn.metrics.pairwise import pairwise_distances
15 from sklearn.cluster import KMeans 15 from sklearn.cluster import KMeans
16 from sklearn.manifold import TSNE 16 from sklearn.manifold import TSNE
17 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score 17 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
18 import matplotlib.pyplot as plt 18 import matplotlib.pyplot as plt
19 19
20 from volia.data_io import read_features,read_lst 20 from volia.data_io import read_features,read_lst
21 from volia.measures import entropy_score 21 from volia.measures import entropy_score, purity_score
22 22
23 ''' 23 '''
24 TODO: 24 TODO:
25 - Add an option allowing the user to choose the number of 25 - Add an option allowing the user to choose the number of
26 clustering to train in order to compute the average and the 26 clustering to train in order to compute the average and the
27 ''' 27 '''
28 28
29 29
def train_clustering(label_encoder, feats, classes, outdir):
    """Train a KMeans clustering on ``feats`` and evaluate it against the labels.

    Parameters
    ----------
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted encoder mapping class names to integer labels.
    feats : np.ndarray
        Feature matrix, one row per utterance.
    classes : list
        Ground-truth class name for each row of ``feats``.
    outdir : str
        Directory where the pickled model, evaluation log and t-SNE figure
        are written.

    Returns
    -------
    dict
        Scalar measures of the run: mean F1, entropy, homogeneity,
        completeness, v-measure, purity scores and K.

    NOTE(review): still reads the module-level ``args`` (for the file prefix)
    and ``start`` (for timing) — confirm before reusing outside this script.
    """
    num_classes = len(label_encoder.classes_)
    # FIX: use the parameters instead of the module globals ``le``/``labels``
    # the original body read; equivalent here because the caller passes the
    # very same encoder and class list.
    labels = label_encoder.transform(classes)

    # Compute KMeans clustering on the data.
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        tol=1e-6,  # FIX: was ``10-6`` which evaluates to the integer 4
        algorithm="elkan",
    )
    estimator.fit(feats)
    # FIX: "intertia" typo in the log message.
    print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}")

    with open(os.path.join(outdir, "_kmeans.pkl"), "wb") as f:
        pickle.dump(estimator, f)

    # Distance of each sample to every cluster centre; nearest cluster wins.
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # Give each cluster a name: the most represented true class inside it.
    dataframe = pd.DataFrame({
        "label": pd.Series([label_encoder.classes_[x] for x in labels]),
        "prediction": pd.Series(predictions),
    })

    def find_cluster_name_fn(c):
        # Majority true label among the samples assigned to cluster ``c``.
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = label_encoder.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure, one score per class.
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(
        "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i])
        for i in range(len(fscores)))

    # Entropy
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogeneity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Purity
    purity_scores = purity_score(labels, predicted_labels)
    purity_class_score = purity_scores["purity_class_score"]
    purity_cluster_score = purity_scores["purity_cluster_score"]
    K = purity_scores["K"]

    # Write the evaluation log.
    with open(os.path.join(outdir, "_" + args.prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score : {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"completeness: {completeness}", file=fd)
        print(f"v-measure: {v_measure}", file=fd)
        print(f"purity class score: {purity_class_score}", file=fd)
        print(f"purity cluster score: {purity_cluster_score}", file=fd)
        print(f"purity overall evaluation criterion (K): {K}", file=fd)

    # Project to 2D with t-SNE and plot true labels vs. predicted clusters.
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(label_encoder.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            # A class may never be the majority label of any cluster.
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end - start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure,
        "purity_class_score": purity_class_score,
        # FIX: key was "purity_cluster score" (stray space) — normalized to
        # match "purity_class_score".
        "purity_cluster_score": purity_cluster_score,
        "K": K,
    }
133 146
if __name__ == "__main__":
    # Command-line interface.
    parser = argparse.ArgumentParser("Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    parser.add_argument("--idsfrom",
                        type=str,
                        default="utt2",
                        choices=[
                            "features",
                            "utt2"
                        ],
                        help="from features or from utt2?")
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")
    parser.add_argument("--nmodels",
                        type=int,
                        default=1,
                        help="specifies the number of models to train")
    args = parser.parse_args()

    # FIX: was ``assert args.outdir`` — asserts are stripped under ``-O``;
    # report the missing option through argparse instead.
    if not args.outdir:
        parser.error("--outdir is required")

    start = time.time()

    # Load the feature matrix and the utterance -> class mapping.
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    # Choose which file provides the utterance id list.
    if args.idsfrom == "features":
        ids = list(features.keys())
    elif args.idsfrom == "utt2":
        ids = list(utt2.keys())
    else:
        # Unreachable in practice: argparse restricts --idsfrom via choices.
        print(f"idsfrom is not good: {args.idsfrom}")
        exit(1)

    feats = np.vstack([features[id_] for id_ in ids])
    classes = [utt2[id_] for id_ in ids]

    # Encode class names as integer labels.
    le = LabelEncoder()
    labels = le.fit_transform(classes)

    # Train ``nmodels`` clusterings and collect each run's measures.
    measures = {}
    for i in range(1, args.nmodels + 1):
        subdir = os.path.join(args.outdir, str(i))
        # FIX: makedirs(exist_ok=True) replaces the exists()+mkdir pair —
        # no race window and it also creates missing parent directories.
        os.makedirs(subdir, exist_ok=True)
        print(f"[{i}/{args.nmodels}] => {subdir}")
        results = train_clustering(le, feats, classes, subdir)

        for key, value in results.items():
            # FIX: append the iterated value directly instead of re-indexing
            # ``results[key]``.
            measures.setdefault(key, []).append(value)

    # Text file with mean/std of every measure.
    file_results = os.path.join(args.outdir, "clustering_measures.txt")

    with open(file_results, "w") as f:
        f.write(f"[nmodels: {args.nmodels}]\n")
        for key, raw_values in measures.items():
            values = np.asarray(raw_values, dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            f.write(f"[{key} => mean: {mean}, std: {std}] \n")

    # CSV file with every individual value plus mean and std.
    file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv")

    with open(file_csv_measures, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean"] + ["std"])
        for key, raw_values in measures.items():
            values = np.asarray(raw_values, dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            writer.writerow([key] + list(values) + [mean] + [std])